]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Add more tests for format selection
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af
PH
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL (non-fatally) and return the truthiness of the result."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True when the login form was submitted and the response does
        not contain the login form again; returns False in every other case
        (no credentials, a download failed, or the credentials were refused).

        Raises ExtractorError when no username is available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Same falsy outcome as the other failure paths, but explicit
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were refused
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body must be bytes on Python 3; urlencode returns text,
        # so encode it the same way _login does.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age; stop at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 126
8377574c 127
de7f3446 128class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_DESC = u'YouTube.com'
# Verbose-mode pattern that suitable() matches against a candidate URL.
# Group 1 is the optional URL prefix; the following group is the 11-character
# video ID.
_VALID_URL = r"""(?x)^
                 (
                     (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     ))
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )
                 )?                                                       # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                 (?(1).+)?                                                # if we found the ID, everything can follow
                 $"""
# Captures the value of a next_url query parameter
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# Listed in order of quality
_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                      # Apple HTTP Live Streaming
                      '96', '95', '94', '93', '92', '132', '151',
                      # 3D
                      '85', '84', '102', '83', '101', '82', '100',
                      # Dash video
                      '138', '137', '248', '136', '247', '135', '246',
                      '245', '244', '134', '243', '133', '242', '160',
                      # Dash audio
                      '141', '172', '140', '171', '139',
                      ]
# Format code -> file extension. Codes '5', '6', '34' and '35' appear in
# _available_formats but have no entry here.
_video_extensions = {
    '13': '3gp',
    '17': '3gp',
    '18': 'mp4',
    '22': 'mp4',
    '36': '3gp',
    '37': 'mp4',
    '38': 'mp4',
    '43': 'webm',
    '44': 'webm',
    '45': 'webm',
    '46': 'webm',

    # 3d videos
    '82': 'mp4',
    '83': 'mp4',
    '84': 'mp4',
    '85': 'mp4',
    '100': 'webm',
    '101': 'webm',
    '102': 'webm',

    # Apple HTTP Live Streaming
    '92': 'mp4',
    '93': 'mp4',
    '94': 'mp4',
    '95': 'mp4',
    '96': 'mp4',
    '132': 'mp4',
    '151': 'mp4',

    # Dash mp4
    '133': 'mp4',
    '134': 'mp4',
    '135': 'mp4',
    '136': 'mp4',
    '137': 'mp4',
    '138': 'mp4',
    '160': 'mp4',

    # Dash mp4 audio
    '139': 'm4a',
    '140': 'm4a',
    '141': 'm4a',

    # Dash webm
    '171': 'webm',
    '172': 'webm',
    '242': 'webm',
    '243': 'webm',
    '244': 'webm',
    '245': 'webm',
    '246': 'webm',
    '247': 'webm',
    '248': 'webm',
}
# Format code -> known display properties. Entries carry any of 'width',
# 'height' and a human-readable 'display' label; an empty dict means nothing
# is recorded for that code.
_video_dimensions = {
    '5': {'width': 400, 'height': 240},
    '6': {},
    '13': {},
    '17': {'width': 176, 'height': 144},
    '18': {'width': 640, 'height': 360},
    '22': {'width': 1280, 'height': 720},
    '34': {'width': 640, 'height': 360},
    '35': {'width': 854, 'height': 480},
    '36': {'width': 320, 'height': 240},
    '37': {'width': 1920, 'height': 1080},
    '38': {'width': 4096, 'height': 3072},
    '43': {'width': 640, 'height': 360},
    '44': {'width': 854, 'height': 480},
    '45': {'width': 1280, 'height': 720},
    '46': {'width': 1920, 'height': 1080},
    '82': {'height': 360, 'display': '360p'},
    '83': {'height': 480, 'display': '480p'},
    '84': {'height': 720, 'display': '720p'},
    '85': {'height': 1080, 'display': '1080p'},
    '92': {'height': 240, 'display': '240p'},
    '93': {'height': 360, 'display': '360p'},
    '94': {'height': 480, 'display': '480p'},
    '95': {'height': 720, 'display': '720p'},
    '96': {'height': 1080, 'display': '1080p'},
    '100': {'height': 360, 'display': '360p'},
    '101': {'height': 480, 'display': '480p'},
    '102': {'height': 720, 'display': '720p'},
    '132': {'height': 240, 'display': '240p'},
    '151': {'height': 72, 'display': '72p'},
    '133': {'height': 240, 'display': '240p'},
    '134': {'height': 360, 'display': '360p'},
    '135': {'height': 480, 'display': '480p'},
    '136': {'height': 720, 'display': '720p'},
    '137': {'height': 1080, 'display': '1080p'},
    '138': {'height': 1081, 'display': '>1080p'},
    '139': {'display': '48k'},
    '140': {'display': '128k'},
    '141': {'display': '256k'},
    '160': {'height': 192, 'display': '192p'},
    '171': {'display': '128k'},
    '172': {'display': '256k'},
    '242': {'height': 240, 'display': '240p'},
    '243': {'height': 360, 'display': '360p'},
    '244': {'height': 480, 'display': '480p'},
    '245': {'height': 480, 'display': '480p'},
    '246': {'height': 480, 'display': '480p'},
    '247': {'height': 720, 'display': '720p'},
    '248': {'height': 1080, 'display': '1080p'},
}
# Format code -> short label for the 3D and DASH formats
_special_itags = {
    '82': '3D',
    '83': '3D',
    '84': '3D',
    '85': '3D',
    '100': '3D',
    '101': '3D',
    '102': '3D',
    '133': 'DASH Video',
    '134': 'DASH Video',
    '135': 'DASH Video',
    '136': 'DASH Video',
    '137': 'DASH Video',
    '138': 'DASH Video',
    '139': 'DASH Audio',
    '140': 'DASH Audio',
    '141': 'DASH Audio',
    '160': 'DASH Video',
    '171': 'DASH Audio',
    '172': 'DASH Audio',
    '242': 'DASH Video',
    '243': 'DASH Video',
    '244': 'DASH Video',
    '245': 'DASH Video',
    '246': 'DASH Video',
    '247': 'DASH Video',
    '248': 'DASH Video',
}

IE_NAME = u'youtube'
# Download test cases: each gives a URL, the expected output file name and
# the expected metadata (descriptions prefixed "md5:" are given as a digest).
_TESTS = [
    {
        u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file":  u"BaW_jenozKc.mp4",
        u"info_dict": {
            u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
            u"uploader": u"Philipp Hagemeister",
            u"uploader_id": u"phihag",
            u"upload_date": u"20121002",
            u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        }
    },
    {
        u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file":  u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"info_dict": {
            u"upload_date": u"20120506",
            u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
            u"description": u"md5:5b292926389560516e384ac437c0ec07",
            u"uploader": u"Icona Pop",
            u"uploader_id": u"IconaPop"
        }
    },
    {
        u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file":  u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"info_dict": {
            u"upload_date": u"20130703",
            u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
            u"description": u"md5:64249768eec3bc4276236606ea996373",
            u"uploader": u"justintimberlakeVEVO",
            u"uploader_id": u"justintimberlakeVEVO"
        }
    },
    {
        # Protocol-relative URL with a mixed-case host name
        u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
        u"file":  u"yZIXLfi8CZQ.mp4",
        u"note": u"Embed-only video (#1746)",
        u"info_dict": {
            u"upload_date": u"20120608",
            u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
            u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
            u"uploader": u"SET India",
            u"uploader_id": u"setindia"
        }
    },
]
350
c5e8d7af
PH
351
@classmethod
def suitable(cls, url):
    """Return True when this extractor should handle *url*.

    URLs accepted by YoutubePlaylistIE are rejected here even if they also
    match _VALID_URL.
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    if claimed_by_playlist:
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 357
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initialiser and create the player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Keyed by (player_url, signature length) in _decrypt_signature
    self._player_cache = dict()
e0df6211 361
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage of *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
365
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that information for *video_id* is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
369
def report_unavailable_format(self, video_id, format):
    """Print a notice that *format* is not available for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
373
def report_rtmp_download(self):
    """Print a notice that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
377
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function that deciphers signatures of length *slen*.

    The player type and id are taken from the end of *player_url*
    ("...-<id>.<ext>"). If a cache directory is configured, a previously
    stored index list for this player and length is loaded from
    <cache_dir>/youtube-sigfuncs/<ext>_<id>_<slen>.json and used directly.
    Otherwise the player is downloaded and parsed with _parse_sig_js or
    _parse_sig_swf, and the resulting permutation is written to the cache
    on a best-effort basis (a failed write only produces a warning).

    Raises ExtractorError if the URL does not have the expected shape or
    the player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        raise ExtractorError(
            u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes a file name; it must not contain path separators
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # cache_spec lists, for each output position, the input index
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # Undecodable or malformed cache file: treat it as a cache miss
            # and regenerate it below.
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An explicit raise, so the check survives "python -O"
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose characters are their own
            # positions; the output code points are then the index list.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
434
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python source equivalent to *func* for signatures of length *slen*.

    *func* is applied to a probe string whose characters are their own
    positions; the resulting index list is compressed into indexing and
    slicing expressions on ``s`` and shown via to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive, walking by stride) as a slice
        lower = str(first) if first != 0 else u''
        stop = last + stride
        upper = u':' if stop < 0 else (u':%d' % stop)
        suffix = (u':%d' % stride) if stride != 1 else u''
        return u's[%s%s%s]' % (lower, upper, suffix)

    def gen_sig_code(idxs):
        stride = None  # None while no +1/-1 run is open
        run_start = '(Never used)'  # always assigned before a run is emitted
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if stride is None:
                if delta in (-1, 1):
                    # prev opens a new run
                    stride, run_start = delta, prev
                else:
                    yield u's[%d]' % prev
            elif delta != stride:
                # The run ended at prev; cur is handled as prev next round
                yield _slice_expr(run_start, prev, stride)
                stride = None
        # Emit whatever is still pending for the last index
        if stride is None:
            yield u's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = u''.join(map(compat_chr, range(slen)))
    permuted = func(probe)
    index_list = [ord(ch) for ch in permuted]
    expr_code = u' + '.join(gen_sig_code(index_list))
    code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 470
e0df6211
PH
471 def _parse_sig_js(self, jscode):
472 funcname = self._search_regex(
473 r'signature=([a-zA-Z]+)', jscode,
474 u'Initial JS player signature function name')
475
476 functions = {}
477
478 def argidx(varname):
479 return string.lowercase.index(varname)
480
481 def interpret_statement(stmt, local_vars, allow_recursion=20):
482 if allow_recursion < 0:
0ca96d48 483 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
484
485 if stmt.startswith(u'var '):
486 stmt = stmt[len(u'var '):]
487 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
488 r'=(?P<expr>.*)$', stmt)
489 if ass_m:
490 if ass_m.groupdict().get('index'):
491 def assign(val):
492 lvar = local_vars[ass_m.group('out')]
493 idx = interpret_expression(ass_m.group('index'),
494 local_vars, allow_recursion)
495 assert isinstance(idx, int)
496 lvar[idx] = val
497 return val
498 expr = ass_m.group('expr')
499 else:
500 def assign(val):
501 local_vars[ass_m.group('out')] = val
502 return val
503 expr = ass_m.group('expr')
504 elif stmt.startswith(u'return '):
505 assign = lambda v: v
506 expr = stmt[len(u'return '):]
507 else:
508 raise ExtractorError(
509 u'Cannot determine left side of statement in %r' % stmt)
510
511 v = interpret_expression(expr, local_vars, allow_recursion)
512 return assign(v)
513
514 def interpret_expression(expr, local_vars, allow_recursion):
515 if expr.isdigit():
516 return int(expr)
517
518 if expr.isalpha():
519 return local_vars[expr]
520
521 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
522 if m:
523 member = m.group('member')
524 val = local_vars[m.group('in')]
525 if member == 'split("")':
526 return list(val)
527 if member == 'join("")':
528 return u''.join(val)
529 if member == 'length':
530 return len(val)
531 if member == 'reverse()':
532 return val[::-1]
533 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
534 if slice_m:
535 idx = interpret_expression(
536 slice_m.group('idx'), local_vars, allow_recursion-1)
537 return val[idx:]
538
539 m = re.match(
540 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
541 if m:
542 val = local_vars[m.group('in')]
543 idx = interpret_expression(m.group('idx'), local_vars,
544 allow_recursion-1)
545 return val[idx]
546
547 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
548 if m:
549 a = interpret_expression(m.group('a'),
550 local_vars, allow_recursion)
551 b = interpret_expression(m.group('b'),
552 local_vars, allow_recursion)
553 return a % b
554
555 m = re.match(
556 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
557 if m:
558 fname = m.group('func')
559 if fname not in functions:
560 functions[fname] = extract_function(fname)
561 argvals = [int(v) if v.isdigit() else local_vars[v]
562 for v in m.group('args').split(',')]
563 return functions[fname](argvals)
564 raise ExtractorError(u'Unsupported JS expression %r' % expr)
565
566 def extract_function(funcname):
567 func_m = re.search(
568 r'function ' + re.escape(funcname) +
569 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
570 jscode)
571 argnames = func_m.group('args').split(',')
572
573 def resf(args):
574 local_vars = dict(zip(argnames, args))
575 for stmt in func_m.group('code').split(';'):
576 res = interpret_statement(stmt, local_vars)
577 return res
578 return resf
579
580 initial_function = extract_function(funcname)
581 return lambda s: initial_function([s])
582
def _parse_sig_swf(self, file_contents):
    """Build a Python signature function from a zlib-compressed SWF player.

    The SWF body is decompressed, the first tag with code 82 is located and
    its ABC (AVM2 bytecode) payload is parsed far enough to find the
    methods of the class named SignatureDecipher. Its ``decipher`` method
    is then executed by a minimal stack-machine interpreter that implements
    only the opcodes handled below.

    Returns a function taking the scrambled signature and returning the
    result of ``decipher``.
    Raises ExtractorError if the data is not an SWF file or the target
    class/function is missing, and NotImplementedError for uncompressed
    SWF files and for unsupported opcodes or properties.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        # Everything after the first 8 bytes is zlib-compressed
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        # Yield (tag_code, tag_bytes) for each tag record in content
        pos = 0
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Long form: the real length follows as a 32-bit integer
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the first 4 bytes and the NUL-terminated field that follows
    # (presumably flags and name - TODO confirm against the SWF spec)
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, low bits first,
        # high bit set on every byte but the last, at most 5 bytes
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            # Reinterpret as a negative two's complement value
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool. Each pool stores count - 1 entries (index 0 is
    # implicit), which is why the loops below start at 1.
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # A count of 0 also means "no entries"; without the clamp a negative
    # size would make read() consume the rest of the stream.
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields following each multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods (only skipped over; nothing from this section is used later)
    method_count = u30()
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for _c2 in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        # Consume one trait record and return the methods it declares
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            # NOTE(review): keyed by index here but by name in the method
            # branch above - kept as in the original, confirm intent
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    # Second per-class section: collect method index -> name for the
    # target class only
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole tag must have been consumed, and every method of the
    # target class must have a body
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    # Cache of already translated methods, keyed by name
    method_pyfunctions = {}

    def extract_function(func_name):
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 holds a placeholder for "this", followed by the
            # arguments and then the locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    # Arguments are popped last-first, hence the reversal
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    # Translating the method here makes it available to
                    # the callproperty branch above
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
996
83799698 997 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 998 """Turn the encrypted s field into a working signature"""
6b37f0be 999
83799698 1000 if player_url is not None:
9f9be844
PH
1001 if player_url.startswith(u'//'):
1002 player_url = u'https:' + player_url
e0df6211 1003 try:
7f8ae73a
PH
1004 player_id = (player_url, len(s))
1005 if player_id not in self._player_cache:
83799698 1006 func = self._extract_signature_function(
c4417ddb 1007 video_id, player_url, len(s)
e0df6211 1008 )
7f8ae73a
PH
1009 self._player_cache[player_id] = func
1010 func = self._player_cache[player_id]
edf3e38e
PH
1011 if self._downloader.params.get('youtube_print_sig_code'):
1012 self._print_sig_code(func, len(s))
1013 return func(s)
0ca96d48 1014 except Exception:
e0df6211 1015 tb = traceback.format_exc()
83799698
PH
1016 self._downloader.report_warning(
1017 u'Automatic signature extraction failed: ' + tb)
e0df6211 1018
d2d8f895
PH
1019 self._downloader.report_warning(
1020 u'Warning: Falling back to static signature algorithm')
920de7a2 1021
2f2ffea9
PH
1022 return self._static_decrypt_signature(
1023 s, video_id, player_url, age_gate)
e0df6211 1024
2f2ffea9 1025 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1026 if age_gate:
1027 # The videos with age protection use another player, so the
1028 # algorithms can be different.
1029 if len(s) == 86:
1030 return s[2:63] + s[82] + s[64:82] + s[63]
1031
bc4b9008 1032 if len(s) == 93:
1033 return s[86:29:-1] + s[88] + s[28:5:-1]
1034 elif len(s) == 92:
444b1165 1035 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
1036 elif len(s) == 91:
1037 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1038 elif len(s) == 90:
1039 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1040 elif len(s) == 89:
1041 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1042 elif len(s) == 88:
3e223834 1043 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1044 elif len(s) == 87:
3a725669 1045 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1046 elif len(s) == 86:
f2c327fd 1047 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1048 elif len(s) == 85:
6ae8ee3f 1049 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1050 elif len(s) == 84:
6f56389b 1051 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1052 elif len(s) == 83:
920de7a2 1053 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1054 elif len(s) == 82:
c21315f2 1055 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1056 elif len(s) == 81:
aedd6bb9 1057 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1058 elif len(s) == 80:
1059 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1060 elif len(s) == 79:
1061 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1062
1063 else:
1064 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1065
1f343eaa 1066 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1067 try:
7fad1c63
JMF
1068 sub_list = self._download_webpage(
1069 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1070 video_id, note=False)
1071 except ExtractorError as err:
de7f3446
JMF
1072 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1073 return {}
1074 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1075
1076 sub_lang_list = {}
1077 for l in lang_list:
1078 lang = l[1]
1079 params = compat_urllib_parse.urlencode({
1080 'lang': lang,
1081 'v': video_id,
ca715127 1082 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
8eddf3e9 1083 'name': l[0].encode('utf-8'),
de7f3446
JMF
1084 })
1085 url = u'http://www.youtube.com/api/timedtext?' + params
1086 sub_lang_list[lang] = url
1087 if not sub_lang_list:
1088 self._downloader.report_warning(u'video doesn\'t have subtitles')
1089 return {}
1090 return sub_lang_list
1091
055e6f36 1092 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1093 """We need the webpage for getting the captions url, pass it as an
1094 argument to speed up the process."""
ca715127 1095 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1096 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1097 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1098 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1099 if mobj is None:
1100 self._downloader.report_warning(err_msg)
1101 return {}
1102 player_config = json.loads(mobj.group(1))
1103 try:
1104 args = player_config[u'args']
1105 caption_url = args[u'ttsurl']
1106 timestamp = args[u'timestamp']
055e6f36
JMF
1107 # We get the available subtitles
1108 list_params = compat_urllib_parse.urlencode({
1109 'type': 'list',
1110 'tlangs': 1,
1111 'asrs': 1,
de7f3446 1112 })
055e6f36 1113 list_url = caption_url + '&' + list_params
e26f8712 1114 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1115 original_lang_node = caption_list.find('track')
f6a54188 1116 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1117 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1118 return {}
1119 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1120
1121 sub_lang_list = {}
1122 for lang_node in caption_list.findall('target'):
1123 sub_lang = lang_node.attrib['lang_code']
1124 params = compat_urllib_parse.urlencode({
1125 'lang': original_lang,
1126 'tlang': sub_lang,
1127 'fmt': sub_format,
1128 'ts': timestamp,
1129 'kind': 'asr',
1130 })
1131 sub_lang_list[sub_lang] = caption_url + '&' + params
1132 return sub_lang_list
de7f3446
JMF
1133 # An extractor error can be raise by the download process if there are
1134 # no automatic captions but there are subtitles
1135 except (KeyError, ExtractorError):
1136 self._downloader.report_warning(err_msg)
1137 return {}
1138
c5e8d7af
PH
1139 def _extract_id(self, url):
1140 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1141 if mobj is None:
1142 raise ExtractorError(u'Invalid URL: %s' % url)
1143 video_id = mobj.group(2)
1144 return video_id
1145
1d043b93
JMF
1146 def _get_video_url_list(self, url_map):
1147 """
1148 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1149 with the requested formats.
1150 """
4ea3be0a 1151 existing_formats = [x for x in self._available_formats if x in url_map]
1d043b93
JMF
1152 if len(existing_formats) == 0:
1153 raise ExtractorError(u'no known formats available for video')
4ea3be0a 1154 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1155 video_url_list.reverse() # order worst to best
1d043b93
JMF
1156 return video_url_list
1157
1158 def _extract_from_m3u8(self, manifest_url, video_id):
1159 url_map = {}
1160 def _get_urls(_manifest):
1161 lines = _manifest.split('\n')
1162 urls = filter(lambda l: l and not l.startswith('#'),
1163 lines)
1164 return urls
1165 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1166 formats_urls = _get_urls(manifest)
1167 for format_url in formats_urls:
890f62e8 1168 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1169 url_map[itag] = format_url
1170 return url_map
1171
1fb07d10
JG
1172 def _extract_annotations(self, video_id):
1173 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1174 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1175
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and the list of formats for a single video.

    Downloads the watch page and the get_video_info data, collects the
    metadata fields (uploader, title, thumbnail, date, description,
    counts, subtitles, duration, annotations), builds one format entry per
    available stream and returns the info dictionary.  Returns None right
    after listing subtitles when the 'listsubtitles' option is set.

    Raises ExtractorError when the video info carries no token, the video
    is a rental, the uploader is missing, rtmpe streams are offered or no
    stream information is found at all.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape characters in the embedded URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different 'el' variants until one response has a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Replace the separators by spaces and collapse runs of whitespace
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the content of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the description meta tag
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Return the number shown in the span with the given class (commas
        # removed), or None when the page has no such span
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        # Only list the subtitles; no video information is returned
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # Merge stream information from the player config embedded in the
    # watch page into video_info; any ValueError raised in this block
    # (including the explicit ones) just skips the merge.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map:
        # if it is, the signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature, usable as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature, needs to be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        # Print details about the signature and the player
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Outside the age gate the JS player named in the
                        # page's assets replaces the SWF player URL
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    formats = []
    for itag, video_real_url in video_url_list:
        # Extension (defaults to 'flv' for itags without a known extension)
        video_extension = self._video_extensions.get(itag, 'flv')
        resolution = self._video_dimensions.get(itag, {}).get('display')
        width = self._video_dimensions.get(itag, {}).get('width')
        height = self._video_dimensions.get(itag, {}).get('height')
        note = self._special_itags.get(itag)

        # Human readable label: '<itag or extension> - <dimensions> (<note>)'
        video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
                                          '%dx%d' % (width, height) if width is not None and height is not None else (resolution if resolution is not None else '???'),
                                          ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')

        formats.append({
            'url':         video_real_url,
            'ext':         video_extension,
            'format':      video_format,
            'format_id':   itag,
            'player_url':  player_url,
            '_resolution': resolution,
            'width':       width,
            'height':      height,
            'format_note': note,
        })

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
c5e8d7af 1456
880e1c52 1457class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1458 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1459 _VALID_URL = r"""(?:
1460 (?:https?://)?
1461 (?:\w+\.)?
1462 youtube\.com/
1463 (?:
1464 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1465 \? (?:.*?&)*? (?:p|a|list)=
1466 | p/
1467 )
715c8e7b 1468 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1469 .*
1470 |
715c8e7b 1471 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1472 )"""
dcbb4580
JMF
1473 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1474 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1475 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1476 IE_NAME = u'youtube:playlist'
1477
1478 @classmethod
1479 def suitable(cls, url):
1480 """Receives a URL and returns True if suitable for this IE."""
1481 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1482
880e1c52
JMF
1483 def _real_initialize(self):
1484 self._login()
1485
652cdaa2
JMF
1486 def _ids_to_results(self, ids):
1487 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1488 for vid_id in ids]
1489
1490 def _extract_mix(self, playlist_id):
1491 # The mixes are generated from a a single video
1492 # the id of the playlist is just 'RD' + video_id
7d4afc55 1493 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1494 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1495 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1496 get_element_by_attribute('class', 'title ', webpage))
1497 title = clean_html(title_span)
652cdaa2
JMF
1498 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1499 ids = orderedSet(re.findall(video_re, webpage))
1500 url_results = self._ids_to_results(ids)
1501
1502 return self.playlist_result(url_results, playlist_id, title)
1503
c5e8d7af
PH
1504 def _real_extract(self, url):
1505 # Extract playlist id
1506 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1507 if mobj is None:
1508 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1509 playlist_id = mobj.group(1) or mobj.group(2)
1510
1511 # Check if it's a video-specific URL
7c61bd36 1512 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1513 if 'v' in query_dict:
1514 video_id = query_dict['v'][0]
1515 if self._downloader.params.get('noplaylist'):
1516 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1517 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1518 else:
1519 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1520
7d4afc55 1521 if playlist_id.startswith('RD'):
652cdaa2
JMF
1522 # Mixes require a custom extraction process
1523 return self._extract_mix(playlist_id)
0a688bc0
JMF
1524 if playlist_id.startswith('TL'):
1525 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1526 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1527
dcbb4580
JMF
1528 # Extract the video ids from the playlist pages
1529 ids = []
c5e8d7af 1530
755eb032 1531 for page_num in itertools.count(1):
dcbb4580 1532 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1533 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1534 matches = re.finditer(self._VIDEO_RE, page)
1535 # We remove the duplicates and the link with index 0
1536 # (it's not the first video of the playlist)
1537 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1538 ids.extend(new_ids)
c5e8d7af 1539
dcbb4580 1540 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1541 break
1542
dcbb4580 1543 playlist_title = self._og_search_title(page)
c5e8d7af 1544
652cdaa2 1545 url_results = self._ids_to_results(ids)
dcbb4580 1546 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1547
1548
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    # Handles the "yttoplist:{channel}:{list title}" pseudo URLs
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the named list on the channel page and return its videos."""
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')

        # The link to the list carries the url-encoded title in its query
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1
        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1579
1580
c5e8d7af 1581class YoutubeChannelIE(InfoExtractor):
0f818663 1582 IE_DESC = u'YouTube.com channels'
c5e8d7af 1583 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1584 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1585 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1586 IE_NAME = u'youtube:channel'
1587
1588 def extract_videos_from_page(self, page):
1589 ids_in_page = []
1590 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1591 if mobj.group(1) not in ids_in_page:
1592 ids_in_page.append(mobj.group(1))
1593 return ids_in_page
1594
1595 def _real_extract(self, url):
1596 # Extract channel id
1597 mobj = re.match(self._VALID_URL, url)
1598 if mobj is None:
1599 raise ExtractorError(u'Invalid URL: %s' % url)
1600
1601 # Download channel page
1602 channel_id = mobj.group(1)
1603 video_ids = []
b9643eed
JMF
1604 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1605 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1606 autogenerated = re.search(r'''(?x)
1607 class="[^"]*?(?:
1608 channel-header-autogenerated-label|
1609 yt-channel-title-autogenerated
1610 )[^"]*"''', channel_page) is not None
c5e8d7af 1611
b9643eed
JMF
1612 if autogenerated:
1613 # The videos are contained in a single page
1614 # the ajax pages can't be used, they are empty
1615 video_ids = self.extract_videos_from_page(channel_page)
1616 else:
1617 # Download all channel pages using the json-based channel_ajax query
1618 for pagenum in itertools.count(1):
1619 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1620 page = self._download_webpage(url, channel_id,
1621 u'Downloading page #%s' % pagenum)
1622
1623 page = json.loads(page)
1624
1625 ids_in_page = self.extract_videos_from_page(page['content_html'])
1626 video_ids.extend(ids_in_page)
1627
1628 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1629 break
c5e8d7af
PH
1630
1631 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1632
7012b23c
PH
1633 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1634 for video_id in video_ids]
1635 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1636
1637
class YoutubeUserIE(InfoExtractor):
    """Extractor for all the uploads of a user (URL or "ytuser" keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result with every upload of the user.

        Raises ExtractorError when the URL does not match or the API
        response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        url_results = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video ID is the last path component of the entry ID
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The actual ID, not the literal string 'video_id'
                    'id': video_id,
                    'title': title,
                })

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(entries) < self._GDATA_PAGE_SIZE:
                break

        return self.playlist_result(url_results, playlist_title=username)
1707
b05654f0
PH
1708
class YoutubeSearchIE(SearchInfoExtractor):
    # Search through the gdata API, 50 results per request
    IE_DESC = u'YouTube.com searches'
    IE_NAME = u'youtube:search'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        offset = 0  # number of results requested so far
        limit = n

        while offset < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), offset + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (offset // 50 + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend(item['id'] for item in api_response['items'])

            # Never ask for more results than the API reports in total
            limit = min(n, api_response['totalItems'])
            offset += 50

        videos = [self.url_result(found_id, 'Youtube', video_id=found_id)
                  for found_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee 1746
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Variant of YoutubeSearchIE whose API URL adds orderby=published.
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1752
class YoutubeShowIE(InfoExtractor):
    # Turns a show page into one YoutubePlaylist reference per season.
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one per playlist linked from the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1766
1767
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """Feed URL with one remaining %s placeholder for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk every page of the feed and return its videos as a playlist."""
        entries = []
        paging = 0
        page_number = 1
        while True:
            raw = self._download_webpage(self._FEED_TEMPLATE % paging,
                                         u'%s feed' % self._FEED_NAME,
                                         u'Downloading page %s' % page_number)
            info = json.loads(raw)
            feed_html = info['feed_html']
            # De-duplicate the ids while keeping first-seen order.
            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # A paging value of None marks the last page.
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1810
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the 'subscriptions' feed.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1816
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the 'recommended' feed.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1822
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the 'watch_later' feed; it is a personal feed.
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1829
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the 'history' feed; it is a personal feed.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' in a non-raw literal
    # is an invalid escape sequence. The pattern text itself is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1836
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    # Resolves the my_favorites page to the playlist it links to.
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'

    def _real_extract(self, url):
        """Return a YoutubePlaylist url result for the favourites playlist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first 'list=' parameter found on the page is taken as the playlist id.
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1847
1848
class YoutubeTruncatedURLIE(InfoExtractor):
    # Matches watch URLs that end right after the 'feature' parameter and
    # answers them with a hint about shell quoting instead of extracting.
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
            u' (or simply  youtube-dl BaW_jenozKc  ).')
        raise ExtractorError(hint, expected=True)