]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube:playlist] Support mix ids longer than 13 (#1295)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af
PH
9import re
10import socket
e0df6211
PH
11import string
12import struct
13import traceback
14import zlib
c5e8d7af 15
b05654f0 16from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 17from .subtitles import SubtitlesInfoExtractor
c5e8d7af 18from ..utils import (
edf3e38e 19 compat_chr,
c5e8d7af
PH
20 compat_http_client,
21 compat_parse_qs,
22 compat_urllib_error,
23 compat_urllib_parse,
24 compat_urllib_request,
7c61bd36 25 compat_urlparse,
c5e8d7af
PH
26 compat_str,
27
28 clean_html,
c38b1e77 29 get_cachedir,
c5e8d7af 30 get_element_by_id,
652cdaa2 31 get_element_by_attribute,
c5e8d7af
PH
32 ExtractorError,
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
c5e8d7af
PH
37)
38
de7f3446 39class YoutubeBaseInfoExtractor(InfoExtractor):
b2e8bc1b
JMF
40 """Provide base functions for Youtube extractors"""
41 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
42 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
43 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
44 _NETRC_MACHINE = 'youtube'
45 # If True it will raise an error if no login info is provided
46 _LOGIN_REQUIRED = False
47
48 def report_lang(self):
49 """Report attempt to set language."""
50 self.to_screen(u'Setting language')
51
52 def _set_language(self):
53 request = compat_urllib_request.Request(self._LANG_URL)
54 try:
55 self.report_lang()
56 compat_urllib_request.urlopen(request).read()
57 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
58 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
59 return False
60 return True
61
62 def _login(self):
63 (username, password) = self._get_login_info()
64 # No authentication to be performed
65 if username is None:
66 if self._LOGIN_REQUIRED:
67 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
68 return False
69
70 request = compat_urllib_request.Request(self._LOGIN_URL)
71 try:
72 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
73 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
74 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
75 return False
76
795f28f8
PH
77 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
78 login_page, u'Login GALX parameter')
c5e8d7af 79
b2e8bc1b
JMF
80 # Log in
81 login_form_strs = {
82 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
83 u'Email': username,
84 u'GALX': galx,
85 u'Passwd': password,
86 u'PersistentCookie': u'yes',
87 u'_utf8': u'霱',
88 u'bgresponse': u'js_disabled',
89 u'checkConnection': u'',
90 u'checkedDomains': u'youtube',
91 u'dnConn': u'',
b2e8bc1b
JMF
92 u'pstMsg': u'0',
93 u'rmShown': u'1',
94 u'secTok': u'',
95 u'signIn': u'Sign in',
96 u'timeStmp': u'',
97 u'service': u'youtube',
98 u'uilel': u'3',
99 u'hl': u'en_US',
100 }
101 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
102 # chokes on unicode
103 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
104 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
105 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
106 try:
107 self.report_login()
108 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
109 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
110 self._downloader.report_warning(u'unable to log in: bad username or password')
111 return False
112 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
113 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
114 return False
115 return True
116
117 def _confirm_age(self):
118 age_form = {
119 'next_url': '/',
120 'action_confirm': 'Confirm',
121 }
122 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
123 try:
124 self.report_age_confirmation()
125 compat_urllib_request.urlopen(request).read().decode('utf-8')
126 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
128 return True
129
130 def _real_initialize(self):
131 if self._downloader is None:
132 return
133 if not self._set_language():
134 return
135 if not self._login():
136 return
137 self._confirm_age()
c5e8d7af 138
8377574c 139
de7f3446 140class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 141 IE_DESC = u'YouTube.com'
cb7dfeea 142 _VALID_URL = r"""(?x)^
c5e8d7af 143 (
83aa5293 144 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 145 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
e69ae5b9
JMF
146 tube\.majestyc\.net/|
147 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
148 (?:.*?\#/)? # handle anchor (#/) redirect urls
149 (?: # the various things that can precede the ID:
150 (?:(?:v|embed|e)/) # v/ or embed/ or e/
151 |(?: # or the v= param in all its forms
d741e55a 152 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
153 (?:\?|\#!?) # the params delimiter ? or # or #!
154 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
155 v=
156 )
f4b05232
JMF
157 ))
158 |youtu\.be/ # just youtu.be/xxxx
159 )
c5e8d7af 160 )? # all until now is optional -> you can pass the naked ID
8963d9c2 161 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
162 (?(1).+)? # if we found the ID, everything can follow
163 $"""
c5e8d7af 164 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 165 # Listed in order of quality
bdc6b3fc 166 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 167 # Apple HTTP Live Streaming
bdc6b3fc 168 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
169 # 3D
170 '85', '84', '102', '83', '101', '82', '100',
171 # Dash video
172 '138', '137', '248', '136', '247', '135', '246',
173 '245', '244', '134', '243', '133', '242', '160',
174 # Dash audio
175 '141', '172', '140', '171', '139',
1d043b93 176 ]
bdc6b3fc 177 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 178 # Apple HTTP Live Streaming
bdc6b3fc
AZ
179 '96', '95', '94', '93', '92', '132', '151',
180 # 3D
86fe61c8 181 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
182 # Dash video
183 '138', '248', '137', '247', '136', '246', '245',
184 '244', '135', '243', '134', '242', '133', '160',
185 # Dash audio
186 '172', '141', '171', '140', '139',
1d043b93 187 ]
bdc6b3fc
AZ
188 _video_formats_map = {
189 'flv': ['35', '34', '6', '5'],
190 '3gp': ['36', '17', '13'],
191 'mp4': ['38', '37', '22', '18'],
192 'webm': ['46', '45', '44', '43'],
193 }
c5e8d7af
PH
194 _video_extensions = {
195 '13': '3gp',
bdc6b3fc 196 '17': '3gp',
c5e8d7af
PH
197 '18': 'mp4',
198 '22': 'mp4',
bdc6b3fc 199 '36': '3gp',
c5e8d7af 200 '37': 'mp4',
d69cf69a 201 '38': 'mp4',
c5e8d7af
PH
202 '43': 'webm',
203 '44': 'webm',
204 '45': 'webm',
205 '46': 'webm',
1d043b93 206
86fe61c8
AZ
207 # 3d videos
208 '82': 'mp4',
209 '83': 'mp4',
210 '84': 'mp4',
211 '85': 'mp4',
212 '100': 'webm',
213 '101': 'webm',
214 '102': 'webm',
836a086c 215
96fb5605 216 # Apple HTTP Live Streaming
1d043b93
JMF
217 '92': 'mp4',
218 '93': 'mp4',
219 '94': 'mp4',
220 '95': 'mp4',
221 '96': 'mp4',
222 '132': 'mp4',
223 '151': 'mp4',
836a086c
AZ
224
225 # Dash mp4
226 '133': 'mp4',
227 '134': 'mp4',
228 '135': 'mp4',
229 '136': 'mp4',
230 '137': 'mp4',
231 '138': 'mp4',
836a086c
AZ
232 '160': 'mp4',
233
f6f1fc92
RB
234 # Dash mp4 audio
235 '139': 'm4a',
16f36a6f
RB
236 '140': 'm4a',
237 '141': 'm4a',
836a086c
AZ
238
239 # Dash webm
240 '171': 'webm',
241 '172': 'webm',
242 '242': 'webm',
243 '243': 'webm',
244 '244': 'webm',
245 '245': 'webm',
246 '246': 'webm',
247 '247': 'webm',
248 '248': 'webm',
c5e8d7af
PH
249 }
250 _video_dimensions = {
d5a9bb4e 251 '5': '400x240',
c5e8d7af
PH
252 '6': '???',
253 '13': '???',
d5a9bb4e
RB
254 '17': '176x144',
255 '18': '640x360',
256 '22': '1280x720',
257 '34': '640x360',
258 '35': '854x480',
259 '36': '320x240',
260 '37': '1920x1080',
261 '38': '4096x3072',
262 '43': '640x360',
263 '44': '854x480',
264 '45': '1280x720',
265 '46': '1920x1080',
86fe61c8
AZ
266 '82': '360p',
267 '83': '480p',
268 '84': '720p',
269 '85': '1080p',
1d043b93
JMF
270 '92': '240p',
271 '93': '360p',
272 '94': '480p',
273 '95': '720p',
274 '96': '1080p',
86fe61c8
AZ
275 '100': '360p',
276 '101': '480p',
836a086c 277 '102': '720p',
1d043b93
JMF
278 '132': '240p',
279 '151': '72p',
836a086c
AZ
280 '133': '240p',
281 '134': '360p',
282 '135': '480p',
283 '136': '720p',
284 '137': '1080p',
285 '138': '>1080p',
286 '139': '48k',
287 '140': '128k',
288 '141': '256k',
289 '160': '192p',
290 '171': '128k',
291 '172': '256k',
292 '242': '240p',
293 '243': '360p',
294 '244': '480p',
295 '245': '480p',
296 '246': '480p',
297 '247': '720p',
298 '248': '1080p',
c5e8d7af 299 }
836a086c
AZ
300 _special_itags = {
301 '82': '3D',
302 '83': '3D',
303 '84': '3D',
304 '85': '3D',
305 '100': '3D',
306 '101': '3D',
307 '102': '3D',
308 '133': 'DASH Video',
309 '134': 'DASH Video',
310 '135': 'DASH Video',
311 '136': 'DASH Video',
312 '137': 'DASH Video',
313 '138': 'DASH Video',
314 '139': 'DASH Audio',
315 '140': 'DASH Audio',
316 '141': 'DASH Audio',
317 '160': 'DASH Video',
318 '171': 'DASH Audio',
319 '172': 'DASH Audio',
320 '242': 'DASH Video',
321 '243': 'DASH Video',
322 '244': 'DASH Video',
323 '245': 'DASH Video',
324 '246': 'DASH Video',
325 '247': 'DASH Video',
326 '248': 'DASH Video',
c5e8d7af 327 }
836a086c 328
c5e8d7af 329 IE_NAME = u'youtube'
2eb88d95
PH
330 _TESTS = [
331 {
0e853ca4
PH
332 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
333 u"file": u"BaW_jenozKc.mp4",
334 u"info_dict": {
335 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
336 u"uploader": u"Philipp Hagemeister",
337 u"uploader_id": u"phihag",
338 u"upload_date": u"20121002",
27dcce19 339 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 340 }
0e853ca4 341 },
0e853ca4
PH
342 {
343 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
344 u"file": u"UxxajLWwzqY.mp4",
345 u"note": u"Test generic use_cipher_signature video (#897)",
346 u"info_dict": {
347 u"upload_date": u"20120506",
348 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 349 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 350 u"uploader": u"Icona Pop",
0e853ca4 351 u"uploader_id": u"IconaPop"
2eb88d95 352 }
c108eb73
JMF
353 },
354 {
355 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
356 u"file": u"07FYdnEawAQ.mp4",
357 u"note": u"Test VEVO video with age protection (#956)",
358 u"info_dict": {
359 u"upload_date": u"20130703",
360 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
361 u"description": u"md5:64249768eec3bc4276236606ea996373",
362 u"uploader": u"justintimberlakeVEVO",
363 u"uploader_id": u"justintimberlakeVEVO"
364 }
365 },
fccd3771 366 {
83aa5293 367 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
368 u"file": u"yZIXLfi8CZQ.mp4",
369 u"note": u"Embed-only video (#1746)",
370 u"info_dict": {
371 u"upload_date": u"20120608",
372 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
373 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
374 u"uploader": u"SET India",
375 u"uploader_id": u"setindia"
376 }
377 },
2eb88d95
PH
378 ]
379
c5e8d7af
PH
380
381 @classmethod
382 def suitable(cls, url):
383 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 384 if YoutubePlaylistIE.suitable(url): return False
fccd3771 385 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 386
e0df6211
PH
387 def __init__(self, *args, **kwargs):
388 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 389 self._player_cache = {}
e0df6211 390
c5e8d7af
PH
391 def report_video_info_webpage_download(self, video_id):
392 """Report attempt to download video info webpage."""
393 self.to_screen(u'%s: Downloading video info webpage' % video_id)
394
c5e8d7af
PH
395 def report_information_extraction(self, video_id):
396 """Report attempt to extract video information."""
397 self.to_screen(u'%s: Extracting video information' % video_id)
398
399 def report_unavailable_format(self, video_id, format):
400 """Report extracted video URL."""
401 self.to_screen(u'%s: Format %s not available' % (video_id, format))
402
403 def report_rtmp_download(self):
404 """Indicate the download will use the RTMP protocol."""
405 self.to_screen(u'RTMP download detected')
406
c4417ddb
PH
407 def _extract_signature_function(self, video_id, player_url, slen):
408 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 409 player_url)
e0df6211
PH
410 player_type = id_m.group('ext')
411 player_id = id_m.group('id')
412
c4417ddb
PH
413 # Read from filesystem cache
414 func_id = '%s_%s_%d' % (player_type, player_id, slen)
415 assert os.path.basename(func_id) == func_id
c38b1e77 416 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 417
c3c88a26 418 cache_enabled = cache_dir is not None
f8061589 419 if cache_enabled:
c4417ddb
PH
420 cache_fn = os.path.join(os.path.expanduser(cache_dir),
421 u'youtube-sigfuncs',
422 func_id + '.json')
423 try:
edf3e38e 424 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
425 cache_spec = json.load(cachef)
426 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 427 except IOError:
c4417ddb 428 pass # No cache available
83799698 429
e0df6211
PH
430 if player_type == 'js':
431 code = self._download_webpage(
432 player_url, video_id,
83799698 433 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 434 errnote=u'Download of %s failed' % player_url)
83799698 435 res = self._parse_sig_js(code)
c4417ddb 436 elif player_type == 'swf':
e0df6211
PH
437 urlh = self._request_webpage(
438 player_url, video_id,
83799698 439 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
440 errnote=u'Download of %s failed' % player_url)
441 code = urlh.read()
83799698 442 res = self._parse_sig_swf(code)
e0df6211
PH
443 else:
444 assert False, 'Invalid player type %r' % player_type
445
f8061589 446 if cache_enabled:
edf3e38e 447 try:
c705320f
PH
448 test_string = u''.join(map(compat_chr, range(slen)))
449 cache_res = res(test_string)
edf3e38e
PH
450 cache_spec = [ord(c) for c in cache_res]
451 try:
452 os.makedirs(os.path.dirname(cache_fn))
453 except OSError as ose:
454 if ose.errno != errno.EEXIST:
455 raise
456 write_json_file(cache_spec, cache_fn)
0ca96d48 457 except Exception:
edf3e38e
PH
458 tb = traceback.format_exc()
459 self._downloader.report_warning(
460 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
461
462 return res
463
edf3e38e
PH
464 def _print_sig_code(self, func, slen):
465 def gen_sig_code(idxs):
466 def _genslice(start, end, step):
467 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
468 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
469 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
470 return u's[%s%s%s]' % (starts, ends, steps)
471
472 step = None
0ca96d48
PH
473 start = '(Never used)' # Quelch pyflakes warnings - start will be
474 # set as soon as step is set
edf3e38e
PH
475 for i, prev in zip(idxs[1:], idxs[:-1]):
476 if step is not None:
477 if i - prev == step:
478 continue
479 yield _genslice(start, prev, step)
480 step = None
481 continue
482 if i - prev in [-1, 1]:
483 step = i - prev
484 start = prev
485 continue
486 else:
487 yield u's[%d]' % prev
488 if step is None:
489 yield u's[%d]' % i
490 else:
491 yield _genslice(start, i, step)
492
c705320f
PH
493 test_string = u''.join(map(compat_chr, range(slen)))
494 cache_res = func(test_string)
edf3e38e
PH
495 cache_spec = [ord(c) for c in cache_res]
496 expr_code = u' + '.join(gen_sig_code(cache_spec))
497 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 498 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 499
e0df6211
PH
500 def _parse_sig_js(self, jscode):
501 funcname = self._search_regex(
502 r'signature=([a-zA-Z]+)', jscode,
503 u'Initial JS player signature function name')
504
505 functions = {}
506
507 def argidx(varname):
508 return string.lowercase.index(varname)
509
510 def interpret_statement(stmt, local_vars, allow_recursion=20):
511 if allow_recursion < 0:
0ca96d48 512 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
513
514 if stmt.startswith(u'var '):
515 stmt = stmt[len(u'var '):]
516 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
517 r'=(?P<expr>.*)$', stmt)
518 if ass_m:
519 if ass_m.groupdict().get('index'):
520 def assign(val):
521 lvar = local_vars[ass_m.group('out')]
522 idx = interpret_expression(ass_m.group('index'),
523 local_vars, allow_recursion)
524 assert isinstance(idx, int)
525 lvar[idx] = val
526 return val
527 expr = ass_m.group('expr')
528 else:
529 def assign(val):
530 local_vars[ass_m.group('out')] = val
531 return val
532 expr = ass_m.group('expr')
533 elif stmt.startswith(u'return '):
534 assign = lambda v: v
535 expr = stmt[len(u'return '):]
536 else:
537 raise ExtractorError(
538 u'Cannot determine left side of statement in %r' % stmt)
539
540 v = interpret_expression(expr, local_vars, allow_recursion)
541 return assign(v)
542
543 def interpret_expression(expr, local_vars, allow_recursion):
544 if expr.isdigit():
545 return int(expr)
546
547 if expr.isalpha():
548 return local_vars[expr]
549
550 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
551 if m:
552 member = m.group('member')
553 val = local_vars[m.group('in')]
554 if member == 'split("")':
555 return list(val)
556 if member == 'join("")':
557 return u''.join(val)
558 if member == 'length':
559 return len(val)
560 if member == 'reverse()':
561 return val[::-1]
562 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
563 if slice_m:
564 idx = interpret_expression(
565 slice_m.group('idx'), local_vars, allow_recursion-1)
566 return val[idx:]
567
568 m = re.match(
569 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
570 if m:
571 val = local_vars[m.group('in')]
572 idx = interpret_expression(m.group('idx'), local_vars,
573 allow_recursion-1)
574 return val[idx]
575
576 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
577 if m:
578 a = interpret_expression(m.group('a'),
579 local_vars, allow_recursion)
580 b = interpret_expression(m.group('b'),
581 local_vars, allow_recursion)
582 return a % b
583
584 m = re.match(
585 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
586 if m:
587 fname = m.group('func')
588 if fname not in functions:
589 functions[fname] = extract_function(fname)
590 argvals = [int(v) if v.isdigit() else local_vars[v]
591 for v in m.group('args').split(',')]
592 return functions[fname](argvals)
593 raise ExtractorError(u'Unsupported JS expression %r' % expr)
594
595 def extract_function(funcname):
596 func_m = re.search(
597 r'function ' + re.escape(funcname) +
598 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
599 jscode)
600 argnames = func_m.group('args').split(',')
601
602 def resf(args):
603 local_vars = dict(zip(argnames, args))
604 for stmt in func_m.group('code').split(';'):
605 res = interpret_statement(stmt, local_vars)
606 return res
607 return resf
608
609 initial_function = extract_function(funcname)
610 return lambda s: initial_function([s])
611
612 def _parse_sig_swf(self, file_contents):
613 if file_contents[1:3] != b'WS':
614 raise ExtractorError(
615 u'Not an SWF file; header is %r' % file_contents[:3])
616 if file_contents[:1] == b'C':
617 content = zlib.decompress(file_contents[8:])
618 else:
619 raise NotImplementedError(u'Unsupported compression format %r' %
620 file_contents[:1])
621
622 def extract_tags(content):
623 pos = 0
624 while pos < len(content):
625 header16 = struct.unpack('<H', content[pos:pos+2])[0]
626 pos += 2
627 tag_code = header16 >> 6
628 tag_len = header16 & 0x3f
629 if tag_len == 0x3f:
630 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
631 pos += 4
632 assert pos+tag_len <= len(content)
633 yield (tag_code, content[pos:pos+tag_len])
634 pos += tag_len
635
636 code_tag = next(tag
637 for tag_code, tag in extract_tags(content)
638 if tag_code == 82)
639 p = code_tag.index(b'\0', 4) + 1
ba552f54 640 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
641
642 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
643 def read_int(reader=None):
644 if reader is None:
645 reader = code_reader
e0df6211
PH
646 res = 0
647 shift = 0
648 for _ in range(5):
ba552f54
PH
649 buf = reader.read(1)
650 assert len(buf) == 1
651 b = struct.unpack('<B', buf)[0]
e0df6211
PH
652 res = res | ((b & 0x7f) << shift)
653 if b & 0x80 == 0:
654 break
655 shift += 7
ba552f54
PH
656 return res
657
658 def u30(reader=None):
659 res = read_int(reader)
660 assert res & 0xf0000000 == 0
e0df6211
PH
661 return res
662 u32 = read_int
663
ba552f54
PH
664 def s32(reader=None):
665 v = read_int(reader)
e0df6211
PH
666 if v & 0x80000000 != 0:
667 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
668 return v
669
0ca96d48 670 def read_string(reader=None):
ba552f54
PH
671 if reader is None:
672 reader = code_reader
673 slen = u30(reader)
674 resb = reader.read(slen)
675 assert len(resb) == slen
676 return resb.decode('utf-8')
677
678 def read_bytes(count, reader=None):
679 if reader is None:
680 reader = code_reader
681 resb = reader.read(count)
682 assert len(resb) == count
683 return resb
684
685 def read_byte(reader=None):
686 resb = read_bytes(1, reader=reader)
687 res = struct.unpack('<B', resb)[0]
688 return res
e0df6211
PH
689
690 # minor_version + major_version
0ca96d48 691 read_bytes(2 + 2)
e0df6211
PH
692
693 # Constant pool
ba552f54 694 int_count = u30()
e0df6211 695 for _c in range(1, int_count):
0ca96d48 696 s32()
ba552f54 697 uint_count = u30()
e0df6211 698 for _c in range(1, uint_count):
0ca96d48 699 u32()
ba552f54 700 double_count = u30()
0ca96d48 701 read_bytes((double_count-1) * 8)
ba552f54 702 string_count = u30()
e0df6211
PH
703 constant_strings = [u'']
704 for _c in range(1, string_count):
0ca96d48 705 s = read_string()
e0df6211 706 constant_strings.append(s)
ba552f54 707 namespace_count = u30()
e0df6211 708 for _c in range(1, namespace_count):
0ca96d48
PH
709 read_bytes(1) # kind
710 u30() # name
ba552f54 711 ns_set_count = u30()
e0df6211 712 for _c in range(1, ns_set_count):
ba552f54 713 count = u30()
e0df6211 714 for _c2 in range(count):
0ca96d48 715 u30()
ba552f54 716 multiname_count = u30()
e0df6211
PH
717 MULTINAME_SIZES = {
718 0x07: 2, # QName
719 0x0d: 2, # QNameA
720 0x0f: 1, # RTQName
721 0x10: 1, # RTQNameA
722 0x11: 0, # RTQNameL
723 0x12: 0, # RTQNameLA
724 0x09: 2, # Multiname
725 0x0e: 2, # MultinameA
726 0x1b: 1, # MultinameL
727 0x1c: 1, # MultinameLA
728 }
729 multinames = [u'']
730 for _c in range(1, multiname_count):
ba552f54 731 kind = u30()
e0df6211
PH
732 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
733 if kind == 0x07:
0ca96d48 734 u30() # namespace_idx
ba552f54 735 name_idx = u30()
e0df6211
PH
736 multinames.append(constant_strings[name_idx])
737 else:
738 multinames.append('[MULTINAME kind: %d]' % kind)
739 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 740 u30()
e0df6211
PH
741
742 # Methods
ba552f54 743 method_count = u30()
e0df6211
PH
744 MethodInfo = collections.namedtuple(
745 'MethodInfo',
746 ['NEED_ARGUMENTS', 'NEED_REST'])
747 method_infos = []
748 for method_id in range(method_count):
ba552f54 749 param_count = u30()
0ca96d48 750 u30() # return type
e0df6211 751 for _ in range(param_count):
0ca96d48
PH
752 u30() # param type
753 u30() # name index (always 0 for youtube)
ba552f54 754 flags = read_byte()
e0df6211
PH
755 if flags & 0x08 != 0:
756 # Options present
ba552f54 757 option_count = u30()
e0df6211 758 for c in range(option_count):
0ca96d48
PH
759 u30() # val
760 read_bytes(1) # kind
e0df6211
PH
761 if flags & 0x80 != 0:
762 # Param names present
763 for _ in range(param_count):
0ca96d48 764 u30() # param name
e0df6211
PH
765 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
766 method_infos.append(mi)
767
768 # Metadata
ba552f54 769 metadata_count = u30()
e0df6211 770 for _c in range(metadata_count):
0ca96d48 771 u30() # name
ba552f54 772 item_count = u30()
e0df6211 773 for _c2 in range(item_count):
0ca96d48
PH
774 u30() # key
775 u30() # value
ba552f54
PH
776
777 def parse_traits_info():
778 trait_name_idx = u30()
779 kind_full = read_byte()
e0df6211
PH
780 kind = kind_full & 0x0f
781 attrs = kind_full >> 4
782 methods = {}
783 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
784 u30() # Slot id
785 u30() # type_name_idx
ba552f54 786 vindex = u30()
e0df6211 787 if vindex != 0:
0ca96d48 788 read_byte() # vkind
e0df6211 789 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 790 u30() # disp_id
ba552f54 791 method_idx = u30()
e0df6211
PH
792 methods[multinames[trait_name_idx]] = method_idx
793 elif kind == 0x04: # Class
0ca96d48
PH
794 u30() # slot_id
795 u30() # classi
e0df6211 796 elif kind == 0x05: # Function
0ca96d48 797 u30() # slot_id
ba552f54 798 function_idx = u30()
e0df6211
PH
799 methods[function_idx] = multinames[trait_name_idx]
800 else:
801 raise ExtractorError(u'Unsupported trait kind %d' % kind)
802
803 if attrs & 0x4 != 0: # Metadata present
ba552f54 804 metadata_count = u30()
e0df6211 805 for _c3 in range(metadata_count):
0ca96d48 806 u30() # metadata index
e0df6211 807
ba552f54 808 return methods
e0df6211
PH
809
810 # Classes
811 TARGET_CLASSNAME = u'SignatureDecipher'
812 searched_idx = multinames.index(TARGET_CLASSNAME)
813 searched_class_id = None
ba552f54 814 class_count = u30()
e0df6211 815 for class_id in range(class_count):
ba552f54 816 name_idx = u30()
e0df6211
PH
817 if name_idx == searched_idx:
818 # We found the class we're looking for!
819 searched_class_id = class_id
0ca96d48 820 u30() # super_name idx
ba552f54 821 flags = read_byte()
e0df6211 822 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 823 u30() # protected_ns_idx
ba552f54 824 intrf_count = u30()
e0df6211 825 for _c2 in range(intrf_count):
0ca96d48
PH
826 u30()
827 u30() # iinit
ba552f54 828 trait_count = u30()
e0df6211 829 for _c2 in range(trait_count):
0ca96d48 830 parse_traits_info()
e0df6211
PH
831
832 if searched_class_id is None:
833 raise ExtractorError(u'Target class %r not found' %
834 TARGET_CLASSNAME)
835
836 method_names = {}
837 method_idxs = {}
838 for class_id in range(class_count):
0ca96d48 839 u30() # cinit
ba552f54 840 trait_count = u30()
e0df6211 841 for _c2 in range(trait_count):
ba552f54 842 trait_methods = parse_traits_info()
e0df6211
PH
843 if class_id == searched_class_id:
844 method_names.update(trait_methods.items())
845 method_idxs.update(dict(
846 (idx, name)
847 for name, idx in trait_methods.items()))
848
849 # Scripts
ba552f54 850 script_count = u30()
e0df6211 851 for _c in range(script_count):
0ca96d48 852 u30() # init
ba552f54 853 trait_count = u30()
e0df6211 854 for _c2 in range(trait_count):
0ca96d48 855 parse_traits_info()
e0df6211
PH
856
857 # Method bodies
ba552f54 858 method_body_count = u30()
e0df6211
PH
859 Method = collections.namedtuple('Method', ['code', 'local_count'])
860 methods = {}
861 for _c in range(method_body_count):
ba552f54 862 method_idx = u30()
0ca96d48 863 u30() # max_stack
ba552f54 864 local_count = u30()
0ca96d48
PH
865 u30() # init_scope_depth
866 u30() # max_scope_depth
ba552f54
PH
867 code_length = u30()
868 code = read_bytes(code_length)
e0df6211 869 if method_idx in method_idxs:
ba552f54 870 m = Method(code, local_count)
e0df6211 871 methods[method_idxs[method_idx]] = m
ba552f54 872 exception_count = u30()
e0df6211 873 for _c2 in range(exception_count):
0ca96d48
PH
874 u30() # from
875 u30() # to
876 u30() # target
877 u30() # exc_type
878 u30() # var_name
ba552f54 879 trait_count = u30()
e0df6211 880 for _c2 in range(trait_count):
0ca96d48 881 parse_traits_info()
e0df6211 882
ba552f54 883 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
884 assert len(methods) == len(method_idxs)
885
886 method_pyfunctions = {}
887
888 def extract_function(func_name):
889 if func_name in method_pyfunctions:
890 return method_pyfunctions[func_name]
891 if func_name not in methods:
892 raise ExtractorError(u'Cannot find function %r' % func_name)
893 m = methods[func_name]
894
895 def resfunc(args):
e0df6211
PH
896 registers = ['(this)'] + list(args) + [None] * m.local_count
897 stack = []
898 coder = io.BytesIO(m.code)
899 while True:
900 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 901 if opcode == 36: # pushbyte
e0df6211
PH
902 v = struct.unpack('!B', coder.read(1))[0]
903 stack.append(v)
904 elif opcode == 44: # pushstring
905 idx = u30(coder)
906 stack.append(constant_strings[idx])
907 elif opcode == 48: # pushscope
908 # We don't implement the scope register, so we'll just
909 # ignore the popped value
910 stack.pop()
911 elif opcode == 70: # callproperty
912 index = u30(coder)
913 mname = multinames[index]
914 arg_count = u30(coder)
915 args = list(reversed(
916 [stack.pop() for _ in range(arg_count)]))
917 obj = stack.pop()
918 if mname == u'split':
919 assert len(args) == 1
920 assert isinstance(args[0], compat_str)
921 assert isinstance(obj, compat_str)
922 if args[0] == u'':
923 res = list(obj)
924 else:
925 res = obj.split(args[0])
926 stack.append(res)
a7177865
PH
927 elif mname == u'slice':
928 assert len(args) == 1
929 assert isinstance(args[0], int)
930 assert isinstance(obj, list)
931 res = obj[args[0]:]
932 stack.append(res)
933 elif mname == u'join':
934 assert len(args) == 1
935 assert isinstance(args[0], compat_str)
936 assert isinstance(obj, list)
937 res = args[0].join(obj)
938 stack.append(res)
e0df6211
PH
939 elif mname in method_pyfunctions:
940 stack.append(method_pyfunctions[mname](args))
941 else:
942 raise NotImplementedError(
943 u'Unsupported property %r on %r'
944 % (mname, obj))
a7177865
PH
945 elif opcode == 72: # returnvalue
946 res = stack.pop()
947 return res
948 elif opcode == 79: # callpropvoid
949 index = u30(coder)
950 mname = multinames[index]
951 arg_count = u30(coder)
952 args = list(reversed(
953 [stack.pop() for _ in range(arg_count)]))
954 obj = stack.pop()
955 if mname == u'reverse':
956 assert isinstance(obj, list)
957 obj.reverse()
958 else:
959 raise NotImplementedError(
960 u'Unsupported (void) property %r on %r'
961 % (mname, obj))
e0df6211
PH
962 elif opcode == 93: # findpropstrict
963 index = u30(coder)
964 mname = multinames[index]
965 res = extract_function(mname)
966 stack.append(res)
967 elif opcode == 97: # setproperty
968 index = u30(coder)
969 value = stack.pop()
970 idx = stack.pop()
971 obj = stack.pop()
972 assert isinstance(obj, list)
973 assert isinstance(idx, int)
974 obj[idx] = value
975 elif opcode == 98: # getlocal
976 index = u30(coder)
977 stack.append(registers[index])
978 elif opcode == 99: # setlocal
979 index = u30(coder)
980 value = stack.pop()
981 registers[index] = value
982 elif opcode == 102: # getproperty
983 index = u30(coder)
984 pname = multinames[index]
985 if pname == u'length':
986 obj = stack.pop()
987 assert isinstance(obj, list)
988 stack.append(len(obj))
989 else: # Assume attribute access
990 idx = stack.pop()
991 assert isinstance(idx, int)
992 obj = stack.pop()
993 assert isinstance(obj, list)
994 stack.append(obj[idx])
995 elif opcode == 128: # coerce
0ca96d48 996 u30(coder)
e0df6211
PH
997 elif opcode == 133: # coerce_s
998 assert isinstance(stack[-1], (type(None), compat_str))
999 elif opcode == 164: # modulo
1000 value2 = stack.pop()
1001 value1 = stack.pop()
1002 res = value1 % value2
1003 stack.append(res)
a7177865
PH
1004 elif opcode == 208: # getlocal_0
1005 stack.append(registers[0])
1006 elif opcode == 209: # getlocal_1
1007 stack.append(registers[1])
1008 elif opcode == 210: # getlocal_2
1009 stack.append(registers[2])
1010 elif opcode == 211: # getlocal_3
1011 stack.append(registers[3])
e0df6211
PH
1012 elif opcode == 214: # setlocal_2
1013 registers[2] = stack.pop()
1014 elif opcode == 215: # setlocal_3
1015 registers[3] = stack.pop()
1016 else:
1017 raise NotImplementedError(
1018 u'Unsupported opcode %d' % opcode)
1019
1020 method_pyfunctions[func_name] = resfunc
1021 return resfunc
1022
1023 initial_function = extract_function(u'decipher')
1024 return lambda s: initial_function([s])
1025
83799698 1026 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1027 """Turn the encrypted s field into a working signature"""
6b37f0be 1028
83799698 1029 if player_url is not None:
9f9be844
PH
1030 if player_url.startswith(u'//'):
1031 player_url = u'https:' + player_url
e0df6211 1032 try:
7f8ae73a
PH
1033 player_id = (player_url, len(s))
1034 if player_id not in self._player_cache:
83799698 1035 func = self._extract_signature_function(
c4417ddb 1036 video_id, player_url, len(s)
e0df6211 1037 )
7f8ae73a
PH
1038 self._player_cache[player_id] = func
1039 func = self._player_cache[player_id]
edf3e38e
PH
1040 if self._downloader.params.get('youtube_print_sig_code'):
1041 self._print_sig_code(func, len(s))
1042 return func(s)
0ca96d48 1043 except Exception:
e0df6211 1044 tb = traceback.format_exc()
83799698
PH
1045 self._downloader.report_warning(
1046 u'Automatic signature extraction failed: ' + tb)
e0df6211 1047
d2d8f895
PH
1048 self._downloader.report_warning(
1049 u'Warning: Falling back to static signature algorithm')
920de7a2 1050
2f2ffea9
PH
1051 return self._static_decrypt_signature(
1052 s, video_id, player_url, age_gate)
e0df6211 1053
2f2ffea9 1054 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1055 if age_gate:
1056 # The videos with age protection use another player, so the
1057 # algorithms can be different.
1058 if len(s) == 86:
1059 return s[2:63] + s[82] + s[64:82] + s[63]
1060
bc4b9008 1061 if len(s) == 93:
1062 return s[86:29:-1] + s[88] + s[28:5:-1]
1063 elif len(s) == 92:
444b1165 1064 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
1065 elif len(s) == 91:
1066 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1067 elif len(s) == 90:
1068 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1069 elif len(s) == 89:
1070 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1071 elif len(s) == 88:
3e223834 1072 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1073 elif len(s) == 87:
3a725669 1074 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1075 elif len(s) == 86:
f2c327fd 1076 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1077 elif len(s) == 85:
6ae8ee3f 1078 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1079 elif len(s) == 84:
6f56389b 1080 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1081 elif len(s) == 83:
920de7a2 1082 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1083 elif len(s) == 82:
c21315f2 1084 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1085 elif len(s) == 81:
aedd6bb9 1086 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1087 elif len(s) == 80:
1088 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1089 elif len(s) == 79:
1090 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1091
1092 else:
1093 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1094
1f343eaa 1095 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1096 try:
7fad1c63
JMF
1097 sub_list = self._download_webpage(
1098 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1099 video_id, note=False)
1100 except ExtractorError as err:
de7f3446
JMF
1101 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1102 return {}
1103 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1104
1105 sub_lang_list = {}
1106 for l in lang_list:
1107 lang = l[1]
1108 params = compat_urllib_parse.urlencode({
1109 'lang': lang,
1110 'v': video_id,
ca715127 1111 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
8eddf3e9 1112 'name': l[0].encode('utf-8'),
de7f3446
JMF
1113 })
1114 url = u'http://www.youtube.com/api/timedtext?' + params
1115 sub_lang_list[lang] = url
1116 if not sub_lang_list:
1117 self._downloader.report_warning(u'video doesn\'t have subtitles')
1118 return {}
1119 return sub_lang_list
1120
055e6f36 1121 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1122 """We need the webpage for getting the captions url, pass it as an
1123 argument to speed up the process."""
ca715127 1124 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1125 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1126 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1127 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1128 if mobj is None:
1129 self._downloader.report_warning(err_msg)
1130 return {}
1131 player_config = json.loads(mobj.group(1))
1132 try:
1133 args = player_config[u'args']
1134 caption_url = args[u'ttsurl']
1135 timestamp = args[u'timestamp']
055e6f36
JMF
1136 # We get the available subtitles
1137 list_params = compat_urllib_parse.urlencode({
1138 'type': 'list',
1139 'tlangs': 1,
1140 'asrs': 1,
de7f3446 1141 })
055e6f36 1142 list_url = caption_url + '&' + list_params
e26f8712 1143 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1144 original_lang_node = caption_list.find('track')
f6a54188 1145 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1146 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1147 return {}
1148 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1149
1150 sub_lang_list = {}
1151 for lang_node in caption_list.findall('target'):
1152 sub_lang = lang_node.attrib['lang_code']
1153 params = compat_urllib_parse.urlencode({
1154 'lang': original_lang,
1155 'tlang': sub_lang,
1156 'fmt': sub_format,
1157 'ts': timestamp,
1158 'kind': 'asr',
1159 })
1160 sub_lang_list[sub_lang] = caption_url + '&' + params
1161 return sub_lang_list
de7f3446
JMF
1162 # An extractor error can be raise by the download process if there are
1163 # no automatic captions but there are subtitles
1164 except (KeyError, ExtractorError):
1165 self._downloader.report_warning(err_msg)
1166 return {}
1167
c5e8d7af
PH
1168 def _print_formats(self, formats):
1169 print('Available formats:')
1170 for x in formats:
03cc7c20
JMF
1171 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1172 self._video_dimensions.get(x, '???'),
836a086c 1173 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1174
1175 def _extract_id(self, url):
1176 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1177 if mobj is None:
1178 raise ExtractorError(u'Invalid URL: %s' % url)
1179 video_id = mobj.group(2)
1180 return video_id
1181
1d043b93
JMF
1182 def _get_video_url_list(self, url_map):
1183 """
1184 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1185 with the requested formats.
1186 """
1187 req_format = self._downloader.params.get('format', None)
1188 format_limit = self._downloader.params.get('format_limit', None)
1189 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1190 if format_limit is not None and format_limit in available_formats:
1191 format_list = available_formats[available_formats.index(format_limit):]
1192 else:
1193 format_list = available_formats
1194 existing_formats = [x for x in format_list if x in url_map]
1195 if len(existing_formats) == 0:
1196 raise ExtractorError(u'no known formats available for video')
1197 if self._downloader.params.get('listformats', None):
1198 self._print_formats(existing_formats)
1199 return
1200 if req_format is None or req_format == 'best':
1201 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1202 elif req_format == 'worst':
1203 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1204 elif req_format in ('-1', 'all'):
1205 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1206 else:
1207 # Specific formats. We pick the first in a slash-delimeted sequence.
bdc6b3fc
AZ
1208 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1209 # available in the specified format. For example,
1210 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1211 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1212 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1d043b93
JMF
1213 req_formats = req_format.split('/')
1214 video_url_list = None
1215 for rf in req_formats:
1216 if rf in url_map:
1217 video_url_list = [(rf, url_map[rf])]
1218 break
bdc6b3fc
AZ
1219 if rf in self._video_formats_map:
1220 for srf in self._video_formats_map[rf]:
1221 if srf in url_map:
1222 video_url_list = [(srf, url_map[srf])]
1223 break
1224 else:
1225 continue
1226 break
1d043b93
JMF
1227 if video_url_list is None:
1228 raise ExtractorError(u'requested format not available')
1229 return video_url_list
1230
1231 def _extract_from_m3u8(self, manifest_url, video_id):
1232 url_map = {}
1233 def _get_urls(_manifest):
1234 lines = _manifest.split('\n')
1235 urls = filter(lambda l: l and not l.startswith('#'),
1236 lines)
1237 return urls
1238 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1239 formats_urls = _get_urls(manifest)
1240 for format_url in formats_urls:
890f62e8 1241 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1242 url_map[itag] = format_url
1243 return url_map
1244
1fb07d10
JG
1245 def _extract_annotations(self, video_id):
1246 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1247 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1248
c5e8d7af
PH
1249 def _real_extract(self, url):
1250 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1251 mobj = re.search(self._NEXT_URL_RE, url)
1252 if mobj:
1253 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1254 video_id = self._extract_id(url)
1255
1256 # Get video webpage
c5e8d7af 1257 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 1258 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
1259
1260 # Attempt to extract SWF player URL
e0df6211 1261 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1262 if mobj is not None:
1263 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1264 else:
1265 player_url = None
1266
1267 # Get video info
1268 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1269 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1270 self.report_age_confirmation()
1271 age_gate = True
1272 # We simulate the access to the video from www.youtube.com/v/{video_id}
1273 # this can be viewed without login into Youtube
1274 data = compat_urllib_parse.urlencode({'video_id': video_id,
fccd3771 1275 'el': 'player_embedded',
c108eb73
JMF
1276 'gl': 'US',
1277 'hl': 'en',
1278 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1279 'asv': 3,
1280 'sts':'1588',
1281 })
1282 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1283 video_info_webpage = self._download_webpage(video_info_url, video_id,
1284 note=False,
1285 errnote='unable to download video info webpage')
1286 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1287 else:
1288 age_gate = False
1289 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1290 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1291 % (video_id, el_type))
1292 video_info_webpage = self._download_webpage(video_info_url, video_id,
1293 note=False,
1294 errnote='unable to download video info webpage')
1295 video_info = compat_parse_qs(video_info_webpage)
1296 if 'token' in video_info:
1297 break
c5e8d7af
PH
1298 if 'token' not in video_info:
1299 if 'reason' in video_info:
9a82b238 1300 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1301 else:
1302 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1303
1d699755
PH
1304 if 'view_count' in video_info:
1305 view_count = int(video_info['view_count'][0])
1306 else:
1307 view_count = None
1308
c5e8d7af
PH
1309 # Check for "rental" videos
1310 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1311 raise ExtractorError(u'"rental" videos not supported')
1312
1313 # Start extracting information
1314 self.report_information_extraction(video_id)
1315
1316 # uploader
1317 if 'author' not in video_info:
1318 raise ExtractorError(u'Unable to extract uploader name')
1319 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1320
1321 # uploader_id
1322 video_uploader_id = None
1323 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1324 if mobj is not None:
1325 video_uploader_id = mobj.group(1)
1326 else:
1327 self._downloader.report_warning(u'unable to extract uploader nickname')
1328
1329 # title
a8c6b241
PH
1330 if 'title' in video_info:
1331 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1332 else:
1333 self._downloader.report_warning(u'Unable to extract video title')
1334 video_title = u'_'
c5e8d7af
PH
1335
1336 # thumbnail image
7763b04e
JMF
1337 # We try first to get a high quality image:
1338 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1339 video_webpage, re.DOTALL)
1340 if m_thumb is not None:
1341 video_thumbnail = m_thumb.group(1)
1342 elif 'thumbnail_url' not in video_info:
c5e8d7af 1343 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 1344 video_thumbnail = None
c5e8d7af
PH
1345 else: # don't panic if we can't find it
1346 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1347
1348 # upload date
1349 upload_date = None
1350 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1351 if mobj is not None:
1352 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1353 upload_date = unified_strdate(upload_date)
1354
1355 # description
1356 video_description = get_element_by_id("eow-description", video_webpage)
1357 if video_description:
27dcce19
PH
1358 video_description = re.sub(r'''(?x)
1359 <a\s+
1360 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1361 title="([^"]+)"\s+
1362 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1363 class="yt-uix-redirect-link"\s*>
1364 [^<]+
1365 </a>
1366 ''', r'\1', video_description)
c5e8d7af
PH
1367 video_description = clean_html(video_description)
1368 else:
1369 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1370 if fd_mobj:
1371 video_description = unescapeHTML(fd_mobj.group(1))
1372 else:
1373 video_description = u''
1374
336c3a69
JMF
1375 def _extract_count(klass):
1376 count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False)
1377 if count is not None:
1378 return int(count.replace(',', ''))
1379 return None
1380 like_count = _extract_count(u'likes-count')
1381 dislike_count = _extract_count(u'dislikes-count')
1382
c5e8d7af 1383 # subtitles
d82134c3 1384 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1385
c5e8d7af 1386 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1387 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1388 return
1389
1390 if 'length_seconds' not in video_info:
1391 self._downloader.report_warning(u'unable to extract video duration')
1392 video_duration = ''
1393 else:
1394 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1395
1fb07d10
JG
1396 # annotations
1397 video_annotations = None
1398 if self._downloader.params.get('writeannotations', False):
1399 video_annotations = self._extract_annotations(video_id)
1400
c5e8d7af 1401 # Decide which formats to download
c5e8d7af
PH
1402
1403 try:
1404 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1405 if not mobj:
1406 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1407 info = json.loads(mobj.group(1))
1408 args = info['args']
7ce7e394
JMF
1409 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1410 # this signatures are encrypted
44d46655 1411 if 'url_encoded_fmt_stream_map' not in args:
f10503db 1412 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
1413 re_signature = re.compile(r'[&,]s=')
1414 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
1415 if m_s is not None:
1416 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1417 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 1418 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 1419 if m_s is not None:
00fe14fc
JMF
1420 if 'adaptive_fmts' in video_info:
1421 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 1422 else:
00fe14fc 1423 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
1424 except ValueError:
1425 pass
1426
1427 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1428 self.report_rtmp_download()
1429 video_url_list = [(None, video_info['conn'][0])]
00fe14fc
JMF
1430 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1431 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1432 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1433 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 1434 url_map = {}
00fe14fc 1435 for url_data_str in encoded_url_map.split(','):
c5e8d7af
PH
1436 url_data = compat_parse_qs(url_data_str)
1437 if 'itag' in url_data and 'url' in url_data:
1438 url = url_data['url'][0]
1439 if 'sig' in url_data:
1440 url += '&signature=' + url_data['sig'][0]
1441 elif 's' in url_data:
e0df6211 1442 encrypted_sig = url_data['s'][0]
769fda3c 1443 if self._downloader.params.get('verbose'):
c108eb73 1444 if age_gate:
bdde940e
PH
1445 if player_url is None:
1446 player_version = 'unknown'
1447 else:
1448 player_version = self._search_regex(
1449 r'-(.+)\.swf$', player_url,
1450 u'flash player', fatal=False)
e0df6211 1451 player_desc = 'flash player %s' % player_version
c108eb73 1452 else:
83799698
PH
1453 player_version = self._search_regex(
1454 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1455 'html5 player', fatal=False)
e0df6211
PH
1456 player_desc = u'html5 player %s' % player_version
1457
1458 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1459 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1460 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1461
83799698 1462 if not age_gate:
e0df6211
PH
1463 jsplayer_url_json = self._search_regex(
1464 r'"assets":.+?"js":\s*("[^"]+")',
1465 video_webpage, u'JS player URL')
83799698 1466 player_url = json.loads(jsplayer_url_json)
e0df6211 1467
83799698
PH
1468 signature = self._decrypt_signature(
1469 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1470 url += '&signature=' + signature
1471 if 'ratebypass' not in url:
1472 url += '&ratebypass=yes'
1473 url_map[url_data['itag'][0]] = url
1d043b93
JMF
1474 video_url_list = self._get_video_url_list(url_map)
1475 if not video_url_list:
c5e8d7af 1476 return
1d043b93
JMF
1477 elif video_info.get('hlsvp'):
1478 manifest_url = video_info['hlsvp'][0]
1479 url_map = self._extract_from_m3u8(manifest_url, video_id)
1480 video_url_list = self._get_video_url_list(url_map)
1481 if not video_url_list:
1482 return
1483
c5e8d7af 1484 else:
9abb3204 1485 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af
PH
1486
1487 results = []
600cc1a4 1488 for itag, video_real_url in video_url_list:
c5e8d7af 1489 # Extension
600cc1a4 1490 video_extension = self._video_extensions.get(itag, 'flv')
c5e8d7af 1491
600cc1a4
JMF
1492 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1493 self._video_dimensions.get(itag, '???'),
1494 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
c5e8d7af
PH
1495
1496 results.append({
1497 'id': video_id,
1498 'url': video_real_url,
1499 'uploader': video_uploader,
1500 'uploader_id': video_uploader_id,
1501 'upload_date': upload_date,
1502 'title': video_title,
1503 'ext': video_extension,
1504 'format': video_format,
600cc1a4 1505 'format_id': itag,
c5e8d7af
PH
1506 'thumbnail': video_thumbnail,
1507 'description': video_description,
1508 'player_url': player_url,
1509 'subtitles': video_subtitles,
8dbe9899 1510 'duration': video_duration,
cfadd183 1511 'age_limit': 18 if age_gate else 0,
9103bbc5
JMF
1512 'annotations': video_annotations,
1513 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1d699755 1514 'view_count': view_count,
336c3a69
JMF
1515 'like_count': like_count,
1516 'dislike_count': dislike_count,
c5e8d7af
PH
1517 })
1518 return results
1519
880e1c52 1520class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1521 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1522 _VALID_URL = r"""(?:
1523 (?:https?://)?
1524 (?:\w+\.)?
1525 youtube\.com/
1526 (?:
1527 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1528 \? (?:.*?&)*? (?:p|a|list)=
1529 | p/
1530 )
c626a3d9 1531 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1532 .*
1533 |
c626a3d9 1534 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
c5e8d7af 1535 )"""
dcbb4580
JMF
1536 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1537 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1538 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1539 IE_NAME = u'youtube:playlist'
1540
1541 @classmethod
1542 def suitable(cls, url):
1543 """Receives a URL and returns True if suitable for this IE."""
1544 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1545
880e1c52
JMF
1546 def _real_initialize(self):
1547 self._login()
1548
652cdaa2
JMF
1549 def _ids_to_results(self, ids):
1550 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1551 for vid_id in ids]
1552
1553 def _extract_mix(self, playlist_id):
1554 # The mixes are generated from a a single video
1555 # the id of the playlist is just 'RD' + video_id
7d4afc55 1556 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1557 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1558 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1559 get_element_by_attribute('class', 'title ', webpage))
1560 title = clean_html(title_span)
652cdaa2
JMF
1561 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1562 ids = orderedSet(re.findall(video_re, webpage))
1563 url_results = self._ids_to_results(ids)
1564
1565 return self.playlist_result(url_results, playlist_id, title)
1566
c5e8d7af
PH
1567 def _real_extract(self, url):
1568 # Extract playlist id
1569 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1570 if mobj is None:
1571 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1572 playlist_id = mobj.group(1) or mobj.group(2)
1573
1574 # Check if it's a video-specific URL
7c61bd36 1575 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1576 if 'v' in query_dict:
1577 video_id = query_dict['v'][0]
1578 if self._downloader.params.get('noplaylist'):
1579 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1580 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1581 else:
1582                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1583
7d4afc55 1584 if playlist_id.startswith('RD'):
652cdaa2
JMF
1585 # Mixes require a custom extraction process
1586 return self._extract_mix(playlist_id)
1587
dcbb4580
JMF
1588 # Extract the video ids from the playlist pages
1589 ids = []
c5e8d7af 1590
755eb032 1591 for page_num in itertools.count(1):
dcbb4580 1592 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1593 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1594 matches = re.finditer(self._VIDEO_RE, page)
1595 # We remove the duplicates and the link with index 0
1596 # (it's not the first video of the playlist)
1597 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1598 ids.extend(new_ids)
c5e8d7af 1599
dcbb4580 1600 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1601 break
1602
dcbb4580 1603 playlist_title = self._og_search_title(page)
c5e8d7af 1604
652cdaa2 1605 url_results = self._ids_to_results(ids)
dcbb4580 1606 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1607
1608
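# A minimal illustrative sketch (not part of the extractor): how _extract_mix above
# derives the watch URL for a mix playlist. A mix id is 'RD' followed by the seed
# video id, so the last 11 characters are the video id; the sample id below reuses
# the BaW_jenozKc id mentioned elsewhere in this file and is only an example.
def _example_mix_watch_url(playlist_id):
    seed_video_id = playlist_id[-11:]
    return 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)

# _example_mix_watch_url('RDBaW_jenozKc')
# -> 'https://youtube.com/watch?v=BaW_jenozKc&list=RDBaW_jenozKc'
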
1609class YoutubeChannelIE(InfoExtractor):
0f818663 1610 IE_DESC = u'YouTube.com channels'
c5e8d7af 1611 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1612 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1613 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1614 IE_NAME = u'youtube:channel'
1615
1616 def extract_videos_from_page(self, page):
1617 ids_in_page = []
1618 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1619 if mobj.group(1) not in ids_in_page:
1620 ids_in_page.append(mobj.group(1))
1621 return ids_in_page
1622
1623 def _real_extract(self, url):
1624 # Extract channel id
1625 mobj = re.match(self._VALID_URL, url)
1626 if mobj is None:
1627 raise ExtractorError(u'Invalid URL: %s' % url)
1628
1629 # Download channel page
1630 channel_id = mobj.group(1)
1631 video_ids = []
b9643eed
JMF
1632 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1633 channel_page = self._download_webpage(url, channel_id)
1634 if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
1635 autogenerated = True
1636 else:
1637 autogenerated = False
c5e8d7af 1638
b9643eed
JMF
1639 if autogenerated:
1640            # The videos are contained in a single page;
1641            # the ajax pages can't be used, they are empty
1642 video_ids = self.extract_videos_from_page(channel_page)
1643 else:
1644 # Download all channel pages using the json-based channel_ajax query
1645 for pagenum in itertools.count(1):
1646 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1647 page = self._download_webpage(url, channel_id,
1648 u'Downloading page #%s' % pagenum)
1649
1650 page = json.loads(page)
1651
1652 ids_in_page = self.extract_videos_from_page(page['content_html'])
1653 video_ids.extend(ids_in_page)
1654
1655 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1656 break
c5e8d7af
PH
1657
1658 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1659
7012b23c
PH
1660 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1661 for video_id in video_ids]
1662 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1663
1664
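# A minimal illustrative sketch (not part of the extractor): the paged channel_ajax
# URLs that YoutubeChannelIE above requests for non-autogenerated channels. The
# channel id below is a hypothetical placeholder.
def _example_channel_ajax_urls(channel_id, page_count):
    template = ('http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1'
                '&flow=list&paging=%s&view=0&sort=da&channel_id=%s')
    return [template % (pagenum, channel_id) for pagenum in range(1, page_count + 1)]

# _example_channel_ajax_urls('UCxxxxxxxxxxxxxxxxxxxxxx', 2) yields the URLs for
# paging=1 and paging=2; the real extractor keeps requesting pages until
# 'yt-uix-load-more' no longer appears in the returned load_more_widget_html.
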
1665class YoutubeUserIE(InfoExtractor):
0f818663 1666 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1667 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1668 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1669 _GDATA_PAGE_SIZE = 50
fd9cf738 1670 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1671 IE_NAME = u'youtube:user'
1672
e3ea4790 1673 @classmethod
f4b05232 1674 def suitable(cls, url):
e3ea4790
JMF
1675        # Don't return True if the url can be extracted with another youtube
1676        # extractor; the regex is too permissive and it would match.
1677 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1678 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1679 else: return super(YoutubeUserIE, cls).suitable(url)
1680
c5e8d7af
PH
1681 def _real_extract(self, url):
1682 # Extract username
1683 mobj = re.match(self._VALID_URL, url)
1684 if mobj is None:
1685 raise ExtractorError(u'Invalid URL: %s' % url)
1686
1687 username = mobj.group(1)
1688
1689        # Download video ids using YouTube Data API. Result size per
1690        # query is limited (currently to 50 videos) so we need to query
1691        # page by page until no video ids are returned - that means we
1692        # got all of them.
1693
1694 video_ids = []
c5e8d7af 1695
755eb032 1696 for pagenum in itertools.count(0):
c5e8d7af
PH
1697 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1698
1699 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1700 page = self._download_webpage(gdata_url, username,
1701 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1702
fd9cf738
JMF
1703 try:
1704 response = json.loads(page)
1705 except ValueError as err:
1706 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1707 if 'entry' not in response['feed']:
1708                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1709 break
fd9cf738 1710
c5e8d7af
PH
1711 # Extract video identifiers
1712 ids_in_page = []
fd9cf738
JMF
1713 for entry in response['feed']['entry']:
1714 ids_in_page.append(entry['id']['$t'].split('/')[-1])
c5e8d7af
PH
1715 video_ids.extend(ids_in_page)
1716
1717            # A little optimization - if the current page is not
1718            # "full", i.e. does not contain _GDATA_PAGE_SIZE video ids,
1719            # then we can assume that this page is the last one - there
1720            # are no more ids on further pages, so there is no need to
1721            # query again.
1722
1723 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1724 break
1725
7012b23c
PH
1726 url_results = [
1727 self.url_result(video_id, 'Youtube', video_id=video_id)
1728 for video_id in video_ids]
1729 return self.playlist_result(url_results, playlist_title=username)
1730
b05654f0
PH
1731
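# A minimal illustrative sketch (not part of the extractor): the 1-based GData
# start-index values that YoutubeUserIE above requests, one page of
# _GDATA_PAGE_SIZE (50) videos at a time.
def _example_gdata_start_indices(page_count, page_size=50):
    return [pagenum * page_size + 1 for pagenum in range(page_count)]

# _example_gdata_start_indices(3) -> [1, 51, 101]
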
1732class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1733 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1734 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1735 _MAX_RESULTS = 1000
1736 IE_NAME = u'youtube:search'
1737 _SEARCH_KEY = 'ytsearch'
1738
1739 def report_download_page(self, query, pagenum):
1740 """Report attempt to download search page with given number."""
1741 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1742
1743 def _get_n_results(self, query, n):
1744 """Get a specified number of results for a query"""
1745
1746 video_ids = []
1747 pagenum = 0
1748 limit = n
1749
1750 while (50 * pagenum) < limit:
1751 self.report_download_page(query, pagenum+1)
1752 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1753 request = compat_urllib_request.Request(result_url)
1754 try:
1755 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1756 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1757 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1758 api_response = json.loads(data)['data']
1759
1760            if 'items' not in api_response:
1761 raise ExtractorError(u'[youtube] No video results')
1762
1763            new_ids = [video['id'] for video in api_response['items']]
1764 video_ids += new_ids
1765
1766 limit = min(n, api_response['totalItems'])
1767 pagenum += 1
1768
1769 if len(video_ids) > n:
1770 video_ids = video_ids[:n]
7012b23c
PH
1771 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1772 for video_id in video_ids]
b05654f0 1773 return self.playlist_result(videos, query)
75dff0ee 1774
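# A minimal illustrative sketch (not part of the extractor): how YoutubeSearchIE
# above builds its GData search URL for a given page. Plain urllib is used here
# instead of the compat_* wrappers so the sketch runs on its own; the query string
# is hypothetical.
try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus  # Python 2

def _example_search_api_url(query, pagenum):
    api_url = ('https://gdata.youtube.com/feeds/api/videos'
               '?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc')
    return api_url % (quote_plus(query), (50 * pagenum) + 1)

# _example_search_api_url('youtube-dl test video', 0) requests results 1-50;
# pagenum 1 requests 51-100, and so on.
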
a3dd9248 1775class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 1776 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248
CM
1777 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1778 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1779 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1780
1781class YoutubeShowIE(InfoExtractor):
0f818663 1782 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1783 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1784 IE_NAME = u'youtube:show'
1785
1786 def _real_extract(self, url):
1787 mobj = re.match(self._VALID_URL, url)
1788 show_name = mobj.group(1)
1789 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1790 # There's one playlist for each season of the show
1791 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1792 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1793 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1794
1795
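# A minimal illustrative sketch (not part of the extractor): YoutubeShowIE above
# yields one playlist URL per season link found on the show page. The playlist
# paths below are hypothetical placeholders.
def _example_show_season_urls(season_paths):
    return ['https://www.youtube.com' + path for path in season_paths]

# _example_show_season_urls(['/playlist?list=PLxxxxxxxxxx', '/playlist?list=PLyyyyyyyyyy'])
# -> ['https://www.youtube.com/playlist?list=PLxxxxxxxxxx',
#     'https://www.youtube.com/playlist?list=PLyyyyyyyyyy']
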
b2e8bc1b 1796class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1797 """
1798 Base class for extractors that fetch info from
1799 http://www.youtube.com/feed_ajax
1800 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1801 """
b2e8bc1b 1802 _LOGIN_REQUIRED = True
43ba5456
JMF
1803 # use action_load_personal_feed instead of action_load_system_feed
1804 _PERSONAL_FEED = False
04cc9617 1805
d7ae0639
JMF
1806 @property
1807 def _FEED_TEMPLATE(self):
43ba5456
JMF
1808 action = 'action_load_system_feed'
1809 if self._PERSONAL_FEED:
1810 action = 'action_load_personal_feed'
1811 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
d7ae0639
JMF
1812
1813 @property
1814 def IE_NAME(self):
1815 return u'youtube:%s' % self._FEED_NAME
04cc9617 1816
81f0259b 1817 def _real_initialize(self):
b2e8bc1b 1818 self._login()
81f0259b 1819
04cc9617
JMF
1820 def _real_extract(self, url):
1821 feed_entries = []
0e44d838
JMF
1822 paging = 0
1823 for i in itertools.count(1):
d7ae0639
JMF
1824 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1825 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1826 u'Downloading page %s' % i)
1827 info = json.loads(info)
1828 feed_html = info['feed_html']
43ba5456 1829 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617 1830 ids = orderedSet(m.group(1) for m in m_ids)
7012b23c
PH
1831 feed_entries.extend(
1832 self.url_result(video_id, 'Youtube', video_id=video_id)
1833 for video_id in ids)
04cc9617
JMF
1834 if info['paging'] is None:
1835 break
0e44d838 1836 paging = info['paging']
d7ae0639
JMF
1837 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1838
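# A minimal illustrative sketch (not part of the extractor): how the _FEED_TEMPLATE
# property of YoutubeFeedsInfoExtractor above chooses its action parameter. The
# remaining '%s' is later filled with the paging token returned by the previous
# response.
def _example_feed_template(feed_name, personal_feed=False):
    action = 'action_load_personal_feed' if personal_feed else 'action_load_system_feed'
    return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, feed_name)

# _example_feed_template('watch_later', personal_feed=True)
# -> 'http://www.youtube.com/feed_ajax?action_load_personal_feed=1&feed_name=watch_later&paging=%s'
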
1839class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1840    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1841 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1842 _FEED_NAME = 'subscriptions'
1843 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1844
1845class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1846 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1847 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1848 _FEED_NAME = 'recommended'
1849 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1850
43ba5456
JMF
1851class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1852 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1853 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1854 _FEED_NAME = 'watch_later'
1855 _PLAYLIST_TITLE = u'Youtube Watch Later'
43ba5456 1856 _PERSONAL_FEED = True
c626a3d9 1857
f459d170
JMF
1858class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1859 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1860    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1861 _FEED_NAME = 'history'
1862 _PERSONAL_FEED = True
1863 _PLAYLIST_TITLE = u'Youtube Watch History'
1864
c626a3d9
JMF
1865class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1866 IE_NAME = u'youtube:favorites'
1867 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1868 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1869 _LOGIN_REQUIRED = True
1870
1871 def _real_extract(self, url):
1872 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1873 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1874 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1875
1876
1877class YoutubeTruncatedURLIE(InfoExtractor):
1878 IE_NAME = 'youtube:truncated_url'
1879 IE_DESC = False # Do not list
1880 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1881
1882 def _real_extract(self, url):
1883 raise ExtractorError(
1884 u'Did you forget to quote the URL? Remember that & is a meta '
1885 u'character in most shells, so you want to put the URL in quotes, '
1886 u'like youtube-dl '
1887 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1888 u' (or simply youtube-dl BaW_jenozKc ).',
1889 expected=True)