# coding: utf-8

import collections
import errno
import io
import itertools
import json
import os.path
import re
import socket
import string
import struct
import traceback
import xml.etree.ElementTree
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_http_client,
    compat_parse_qs,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_cachedir,
    get_element_by_id,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here it is! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
194 _video_extensions = {
195 '13': '3gp',
bdc6b3fc 196 '17': '3gp',
197 '18': 'mp4',
198 '22': 'mp4',
bdc6b3fc 199 '36': '3gp',
c5e8d7af 200 '37': 'mp4',
d69cf69a 201 '38': 'mp4',
202 '43': 'webm',
203 '44': 'webm',
204 '45': 'webm',
205 '46': 'webm',
1d043b93 206
207 # 3d videos
208 '82': 'mp4',
209 '83': 'mp4',
210 '84': 'mp4',
211 '85': 'mp4',
212 '100': 'webm',
213 '101': 'webm',
214 '102': 'webm',
836a086c 215
96fb5605 216 # Apple HTTP Live Streaming
217 '92': 'mp4',
218 '93': 'mp4',
219 '94': 'mp4',
220 '95': 'mp4',
221 '96': 'mp4',
222 '132': 'mp4',
223 '151': 'mp4',
224
225 # Dash mp4
226 '133': 'mp4',
227 '134': 'mp4',
228 '135': 'mp4',
229 '136': 'mp4',
230 '137': 'mp4',
231 '138': 'mp4',
232 '160': 'mp4',
233
234 # Dash mp4 audio
235 '139': 'm4a',
236 '140': 'm4a',
237 '141': 'm4a',
238
239 # Dash webm
240 '171': 'webm',
241 '172': 'webm',
242 '242': 'webm',
243 '243': 'webm',
244 '244': 'webm',
245 '245': 'webm',
246 '246': 'webm',
247 '247': 'webm',
248 '248': 'webm',
249 }
250 _video_dimensions = {
251 '5': '240x400',
252 '6': '???',
253 '13': '???',
254 '17': '144x176',
255 '18': '360x640',
256 '22': '720x1280',
257 '34': '360x640',
258 '35': '480x854',
bdc6b3fc 259 '36': '240x320',
260 '37': '1080x1920',
261 '38': '3072x4096',
262 '43': '360x640',
263 '44': '480x854',
264 '45': '720x1280',
265 '46': '1080x1920',
266 '82': '360p',
267 '83': '480p',
268 '84': '720p',
269 '85': '1080p',
270 '92': '240p',
271 '93': '360p',
272 '94': '480p',
273 '95': '720p',
274 '96': '1080p',
275 '100': '360p',
276 '101': '480p',
836a086c 277 '102': '720p',
278 '132': '240p',
279 '151': '72p',
280 '133': '240p',
281 '134': '360p',
282 '135': '480p',
283 '136': '720p',
284 '137': '1080p',
285 '138': '>1080p',
286 '139': '48k',
287 '140': '128k',
288 '141': '256k',
289 '160': '192p',
290 '171': '128k',
291 '172': '256k',
292 '242': '240p',
293 '243': '360p',
294 '244': '480p',
295 '245': '480p',
296 '246': '480p',
297 '247': '720p',
298 '248': '1080p',
c5e8d7af 299 }
300 _special_itags = {
301 '82': '3D',
302 '83': '3D',
303 '84': '3D',
304 '85': '3D',
305 '100': '3D',
306 '101': '3D',
307 '102': '3D',
308 '133': 'DASH Video',
309 '134': 'DASH Video',
310 '135': 'DASH Video',
311 '136': 'DASH Video',
312 '137': 'DASH Video',
313 '138': 'DASH Video',
314 '139': 'DASH Audio',
315 '140': 'DASH Audio',
316 '141': 'DASH Audio',
317 '160': 'DASH Video',
318 '171': 'DASH Audio',
319 '172': 'DASH Audio',
320 '242': 'DASH Video',
321 '243': 'DASH Video',
322 '244': 'DASH Video',
323 '245': 'DASH Video',
324 '246': 'DASH Video',
325 '247': 'DASH Video',
326 '248': 'DASH Video',
c5e8d7af 327 }
    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
    ]

369 @classmethod
370 def suitable(cls, url):
371 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 372 if YoutubePlaylistIE.suitable(url): return False
373 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
374
375 def __init__(self, *args, **kwargs):
376 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 377 self._player_cache = {}
e0df6211 378
379 def report_video_webpage_download(self, video_id):
380 """Report attempt to download video webpage."""
381 self.to_screen(u'%s: Downloading video webpage' % video_id)
382
383 def report_video_info_webpage_download(self, video_id):
384 """Report attempt to download video info webpage."""
385 self.to_screen(u'%s: Downloading video info webpage' % video_id)
386
387 def report_information_extraction(self, video_id):
388 """Report attempt to extract video information."""
389 self.to_screen(u'%s: Extracting video information' % video_id)
390
391 def report_unavailable_format(self, video_id, format):
392 """Report extracted video URL."""
393 self.to_screen(u'%s: Format %s not available' % (video_id, format))
394
395 def report_rtmp_download(self):
396 """Indicate the download will use the RTMP protocol."""
397 self.to_screen(u'RTMP download detected')
398
399 def _extract_signature_function(self, video_id, player_url, slen):
400 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 401 player_url)
402 player_type = id_m.group('ext')
403 player_id = id_m.group('id')
404
405 # Read from filesystem cache
406 func_id = '%s_%s_%d' % (player_type, player_id, slen)
407 assert os.path.basename(func_id) == func_id
c38b1e77 408 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 409
c3c88a26 410 cache_enabled = cache_dir is not None
f8061589 411 if cache_enabled:
412 cache_fn = os.path.join(os.path.expanduser(cache_dir),
413 u'youtube-sigfuncs',
414 func_id + '.json')
415 try:
edf3e38e 416 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
417 cache_spec = json.load(cachef)
418 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 419 except IOError:
c4417ddb 420 pass # No cache available
83799698 421
422 if player_type == 'js':
423 code = self._download_webpage(
424 player_url, video_id,
83799698 425 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 426 errnote=u'Download of %s failed' % player_url)
83799698 427 res = self._parse_sig_js(code)
c4417ddb 428 elif player_type == 'swf':
429 urlh = self._request_webpage(
430 player_url, video_id,
83799698 431 note=u'Downloading %s player %s' % (player_type, player_id),
432 errnote=u'Download of %s failed' % player_url)
433 code = urlh.read()
83799698 434 res = self._parse_sig_swf(code)
435 else:
436 assert False, 'Invalid player type %r' % player_type
437
f8061589 438 if cache_enabled:
edf3e38e 439 try:
440 test_string = u''.join(map(compat_chr, range(slen)))
441 cache_res = res(test_string)
442 cache_spec = [ord(c) for c in cache_res]
443 try:
444 os.makedirs(os.path.dirname(cache_fn))
445 except OSError as ose:
446 if ose.errno != errno.EEXIST:
447 raise
448 write_json_file(cache_spec, cache_fn)
0ca96d48 449 except Exception:
450 tb = traceback.format_exc()
451 self._downloader.report_warning(
452 u'Writing cache to %r failed: %s' % (cache_fn, tb))
453
454 return res
455
456 def _print_sig_code(self, func, slen):
457 def gen_sig_code(idxs):
458 def _genslice(start, end, step):
459 starts = u'' if start == 0 else str(start)
460 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
461 steps = u'' if step == 1 else (u':%d' % step)
462 return u's[%s%s%s]' % (starts, ends, steps)
463
464 step = None
465 start = '(Never used)' # Quelch pyflakes warnings - start will be
466 # set as soon as step is set
467 for i, prev in zip(idxs[1:], idxs[:-1]):
468 if step is not None:
469 if i - prev == step:
470 continue
471 yield _genslice(start, prev, step)
472 step = None
473 continue
474 if i - prev in [-1, 1]:
475 step = i - prev
476 start = prev
477 continue
478 else:
479 yield u's[%d]' % prev
480 if step is None:
481 yield u's[%d]' % i
482 else:
483 yield _genslice(start, i, step)
484
485 test_string = u''.join(map(compat_chr, range(slen)))
486 cache_res = func(test_string)
487 cache_spec = [ord(c) for c in cache_res]
488 expr_code = u' + '.join(gen_sig_code(cache_spec))
489 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 490 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 491
492 def _parse_sig_js(self, jscode):
493 funcname = self._search_regex(
494 r'signature=([a-zA-Z]+)', jscode,
495 u'Initial JS player signature function name')
496
497 functions = {}
498
499 def argidx(varname):
            return string.ascii_lowercase.index(varname)
501
502 def interpret_statement(stmt, local_vars, allow_recursion=20):
503 if allow_recursion < 0:
0ca96d48 504 raise ExtractorError(u'Recursion limit reached')
505
506 if stmt.startswith(u'var '):
507 stmt = stmt[len(u'var '):]
508 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
509 r'=(?P<expr>.*)$', stmt)
510 if ass_m:
511 if ass_m.groupdict().get('index'):
512 def assign(val):
513 lvar = local_vars[ass_m.group('out')]
514 idx = interpret_expression(ass_m.group('index'),
515 local_vars, allow_recursion)
516 assert isinstance(idx, int)
517 lvar[idx] = val
518 return val
519 expr = ass_m.group('expr')
520 else:
521 def assign(val):
522 local_vars[ass_m.group('out')] = val
523 return val
524 expr = ass_m.group('expr')
525 elif stmt.startswith(u'return '):
526 assign = lambda v: v
527 expr = stmt[len(u'return '):]
528 else:
529 raise ExtractorError(
530 u'Cannot determine left side of statement in %r' % stmt)
531
532 v = interpret_expression(expr, local_vars, allow_recursion)
533 return assign(v)
534
535 def interpret_expression(expr, local_vars, allow_recursion):
536 if expr.isdigit():
537 return int(expr)
538
539 if expr.isalpha():
540 return local_vars[expr]
541
542 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
543 if m:
544 member = m.group('member')
545 val = local_vars[m.group('in')]
546 if member == 'split("")':
547 return list(val)
548 if member == 'join("")':
549 return u''.join(val)
550 if member == 'length':
551 return len(val)
552 if member == 'reverse()':
553 return val[::-1]
554 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
555 if slice_m:
556 idx = interpret_expression(
557 slice_m.group('idx'), local_vars, allow_recursion-1)
558 return val[idx:]
559
560 m = re.match(
561 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
562 if m:
563 val = local_vars[m.group('in')]
564 idx = interpret_expression(m.group('idx'), local_vars,
565 allow_recursion-1)
566 return val[idx]
567
568 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
569 if m:
570 a = interpret_expression(m.group('a'),
571 local_vars, allow_recursion)
572 b = interpret_expression(m.group('b'),
573 local_vars, allow_recursion)
574 return a % b
575
576 m = re.match(
577 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
578 if m:
579 fname = m.group('func')
580 if fname not in functions:
581 functions[fname] = extract_function(fname)
582 argvals = [int(v) if v.isdigit() else local_vars[v]
583 for v in m.group('args').split(',')]
584 return functions[fname](argvals)
585 raise ExtractorError(u'Unsupported JS expression %r' % expr)
586
587 def extract_function(funcname):
588 func_m = re.search(
589 r'function ' + re.escape(funcname) +
590 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
591 jscode)
592 argnames = func_m.group('args').split(',')
593
594 def resf(args):
595 local_vars = dict(zip(argnames, args))
596 for stmt in func_m.group('code').split(';'):
597 res = interpret_statement(stmt, local_vars)
598 return res
599 return resf
600
601 initial_function = extract_function(funcname)
602 return lambda s: initial_function([s])
603
604 def _parse_sig_swf(self, file_contents):
605 if file_contents[1:3] != b'WS':
606 raise ExtractorError(
607 u'Not an SWF file; header is %r' % file_contents[:3])
608 if file_contents[:1] == b'C':
609 content = zlib.decompress(file_contents[8:])
610 else:
611 raise NotImplementedError(u'Unsupported compression format %r' %
612 file_contents[:1])
613
614 def extract_tags(content):
615 pos = 0
616 while pos < len(content):
617 header16 = struct.unpack('<H', content[pos:pos+2])[0]
618 pos += 2
619 tag_code = header16 >> 6
620 tag_len = header16 & 0x3f
621 if tag_len == 0x3f:
622 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
623 pos += 4
624 assert pos+tag_len <= len(content)
625 yield (tag_code, content[pos:pos+tag_len])
626 pos += tag_len
627
628 code_tag = next(tag
629 for tag_code, tag in extract_tags(content)
630 if tag_code == 82)
631 p = code_tag.index(b'\0', 4) + 1
ba552f54 632 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
633
634 # Parse ABC (AVM2 ByteCode)
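        # ABC stores its u30/u32/s32 integers in a variable-length encoding:
        # each byte contributes 7 bits (low bits first) and the high bit marks
        # a continuation, so read_int below consumes between one and five bytes.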
ba552f54
PH
635 def read_int(reader=None):
636 if reader is None:
637 reader = code_reader
e0df6211
PH
638 res = 0
639 shift = 0
640 for _ in range(5):
ba552f54
PH
641 buf = reader.read(1)
642 assert len(buf) == 1
643 b = struct.unpack('<B', buf)[0]
e0df6211
PH
644 res = res | ((b & 0x7f) << shift)
645 if b & 0x80 == 0:
646 break
647 shift += 7
ba552f54
PH
648 return res
649
650 def u30(reader=None):
651 res = read_int(reader)
652 assert res & 0xf0000000 == 0
e0df6211
PH
653 return res
654 u32 = read_int
655
ba552f54
PH
656 def s32(reader=None):
657 v = read_int(reader)
e0df6211
PH
658 if v & 0x80000000 != 0:
659 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
660 return v
661
0ca96d48 662 def read_string(reader=None):
ba552f54
PH
663 if reader is None:
664 reader = code_reader
665 slen = u30(reader)
666 resb = reader.read(slen)
667 assert len(resb) == slen
668 return resb.decode('utf-8')
669
670 def read_bytes(count, reader=None):
671 if reader is None:
672 reader = code_reader
673 resb = reader.read(count)
674 assert len(resb) == count
675 return resb
676
677 def read_byte(reader=None):
678 resb = read_bytes(1, reader=reader)
679 res = struct.unpack('<B', resb)[0]
680 return res
e0df6211
PH
681
682 # minor_version + major_version
0ca96d48 683 read_bytes(2 + 2)
e0df6211
PH
684
685 # Constant pool
ba552f54 686 int_count = u30()
e0df6211 687 for _c in range(1, int_count):
0ca96d48 688 s32()
ba552f54 689 uint_count = u30()
e0df6211 690 for _c in range(1, uint_count):
0ca96d48 691 u32()
ba552f54 692 double_count = u30()
0ca96d48 693 read_bytes((double_count-1) * 8)
ba552f54 694 string_count = u30()
e0df6211
PH
695 constant_strings = [u'']
696 for _c in range(1, string_count):
0ca96d48 697 s = read_string()
e0df6211 698 constant_strings.append(s)
ba552f54 699 namespace_count = u30()
e0df6211 700 for _c in range(1, namespace_count):
0ca96d48
PH
701 read_bytes(1) # kind
702 u30() # name
ba552f54 703 ns_set_count = u30()
e0df6211 704 for _c in range(1, ns_set_count):
ba552f54 705 count = u30()
e0df6211 706 for _c2 in range(count):
0ca96d48 707 u30()
ba552f54 708 multiname_count = u30()
e0df6211
PH
709 MULTINAME_SIZES = {
710 0x07: 2, # QName
711 0x0d: 2, # QNameA
712 0x0f: 1, # RTQName
713 0x10: 1, # RTQNameA
714 0x11: 0, # RTQNameL
715 0x12: 0, # RTQNameLA
716 0x09: 2, # Multiname
717 0x0e: 2, # MultinameA
718 0x1b: 1, # MultinameL
719 0x1c: 1, # MultinameLA
720 }
721 multinames = [u'']
722 for _c in range(1, multiname_count):
ba552f54 723 kind = u30()
e0df6211
PH
724 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
725 if kind == 0x07:
0ca96d48 726 u30() # namespace_idx
ba552f54 727 name_idx = u30()
e0df6211
PH
728 multinames.append(constant_strings[name_idx])
729 else:
730 multinames.append('[MULTINAME kind: %d]' % kind)
731 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 732 u30()
e0df6211
PH
733
734 # Methods
ba552f54 735 method_count = u30()
e0df6211
PH
736 MethodInfo = collections.namedtuple(
737 'MethodInfo',
738 ['NEED_ARGUMENTS', 'NEED_REST'])
739 method_infos = []
740 for method_id in range(method_count):
ba552f54 741 param_count = u30()
0ca96d48 742 u30() # return type
e0df6211 743 for _ in range(param_count):
0ca96d48
PH
744 u30() # param type
745 u30() # name index (always 0 for youtube)
ba552f54 746 flags = read_byte()
e0df6211
PH
747 if flags & 0x08 != 0:
748 # Options present
ba552f54 749 option_count = u30()
e0df6211 750 for c in range(option_count):
0ca96d48
PH
751 u30() # val
752 read_bytes(1) # kind
e0df6211
PH
753 if flags & 0x80 != 0:
754 # Param names present
755 for _ in range(param_count):
0ca96d48 756 u30() # param name
e0df6211
PH
757 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
758 method_infos.append(mi)
759
760 # Metadata
ba552f54 761 metadata_count = u30()
e0df6211 762 for _c in range(metadata_count):
0ca96d48 763 u30() # name
ba552f54 764 item_count = u30()
e0df6211 765 for _c2 in range(item_count):
0ca96d48
PH
766 u30() # key
767 u30() # value
ba552f54
PH
768
769 def parse_traits_info():
770 trait_name_idx = u30()
771 kind_full = read_byte()
e0df6211
PH
772 kind = kind_full & 0x0f
773 attrs = kind_full >> 4
774 methods = {}
775 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
776 u30() # Slot id
777 u30() # type_name_idx
ba552f54 778 vindex = u30()
e0df6211 779 if vindex != 0:
0ca96d48 780 read_byte() # vkind
e0df6211 781 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 782 u30() # disp_id
ba552f54 783 method_idx = u30()
e0df6211
PH
784 methods[multinames[trait_name_idx]] = method_idx
785 elif kind == 0x04: # Class
0ca96d48
PH
786 u30() # slot_id
787 u30() # classi
e0df6211 788 elif kind == 0x05: # Function
0ca96d48 789 u30() # slot_id
ba552f54 790 function_idx = u30()
e0df6211
PH
791 methods[function_idx] = multinames[trait_name_idx]
792 else:
793 raise ExtractorError(u'Unsupported trait kind %d' % kind)
794
795 if attrs & 0x4 != 0: # Metadata present
ba552f54 796 metadata_count = u30()
e0df6211 797 for _c3 in range(metadata_count):
0ca96d48 798 u30() # metadata index
e0df6211 799
ba552f54 800 return methods
e0df6211
PH
801
802 # Classes
803 TARGET_CLASSNAME = u'SignatureDecipher'
804 searched_idx = multinames.index(TARGET_CLASSNAME)
805 searched_class_id = None
ba552f54 806 class_count = u30()
e0df6211 807 for class_id in range(class_count):
ba552f54 808 name_idx = u30()
e0df6211
PH
809 if name_idx == searched_idx:
810 # We found the class we're looking for!
811 searched_class_id = class_id
0ca96d48 812 u30() # super_name idx
ba552f54 813 flags = read_byte()
e0df6211 814 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 815 u30() # protected_ns_idx
ba552f54 816 intrf_count = u30()
e0df6211 817 for _c2 in range(intrf_count):
0ca96d48
PH
818 u30()
819 u30() # iinit
ba552f54 820 trait_count = u30()
e0df6211 821 for _c2 in range(trait_count):
0ca96d48 822 parse_traits_info()
e0df6211
PH
823
824 if searched_class_id is None:
825 raise ExtractorError(u'Target class %r not found' %
826 TARGET_CLASSNAME)
827
828 method_names = {}
829 method_idxs = {}
830 for class_id in range(class_count):
0ca96d48 831 u30() # cinit
ba552f54 832 trait_count = u30()
e0df6211 833 for _c2 in range(trait_count):
ba552f54 834 trait_methods = parse_traits_info()
e0df6211
PH
835 if class_id == searched_class_id:
836 method_names.update(trait_methods.items())
837 method_idxs.update(dict(
838 (idx, name)
839 for name, idx in trait_methods.items()))
840
841 # Scripts
ba552f54 842 script_count = u30()
e0df6211 843 for _c in range(script_count):
0ca96d48 844 u30() # init
ba552f54 845 trait_count = u30()
e0df6211 846 for _c2 in range(trait_count):
0ca96d48 847 parse_traits_info()
e0df6211
PH
848
849 # Method bodies
ba552f54 850 method_body_count = u30()
e0df6211
PH
851 Method = collections.namedtuple('Method', ['code', 'local_count'])
852 methods = {}
853 for _c in range(method_body_count):
ba552f54 854 method_idx = u30()
0ca96d48 855 u30() # max_stack
ba552f54 856 local_count = u30()
0ca96d48
PH
857 u30() # init_scope_depth
858 u30() # max_scope_depth
ba552f54
PH
859 code_length = u30()
860 code = read_bytes(code_length)
e0df6211 861 if method_idx in method_idxs:
ba552f54 862 m = Method(code, local_count)
e0df6211 863 methods[method_idxs[method_idx]] = m
ba552f54 864 exception_count = u30()
e0df6211 865 for _c2 in range(exception_count):
0ca96d48
PH
866 u30() # from
867 u30() # to
868 u30() # target
869 u30() # exc_type
870 u30() # var_name
ba552f54 871 trait_count = u30()
e0df6211 872 for _c2 in range(trait_count):
0ca96d48 873 parse_traits_info()
e0df6211 874
ba552f54 875 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
876 assert len(methods) == len(method_idxs)
877
878 method_pyfunctions = {}
879
880 def extract_function(func_name):
881 if func_name in method_pyfunctions:
882 return method_pyfunctions[func_name]
883 if func_name not in methods:
884 raise ExtractorError(u'Cannot find function %r' % func_name)
885 m = methods[func_name]
886
887 def resfunc(args):
e0df6211
PH
888 registers = ['(this)'] + list(args) + [None] * m.local_count
889 stack = []
890 coder = io.BytesIO(m.code)
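                # Minimal AVM2 stack-machine loop: only the opcodes that
                # YouTube's decipher routines actually use are implemented;
                # anything else raises NotImplementedError below.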
891 while True:
892 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 893 if opcode == 36: # pushbyte
e0df6211
PH
894 v = struct.unpack('!B', coder.read(1))[0]
895 stack.append(v)
896 elif opcode == 44: # pushstring
897 idx = u30(coder)
898 stack.append(constant_strings[idx])
899 elif opcode == 48: # pushscope
900 # We don't implement the scope register, so we'll just
901 # ignore the popped value
902 stack.pop()
903 elif opcode == 70: # callproperty
904 index = u30(coder)
905 mname = multinames[index]
906 arg_count = u30(coder)
907 args = list(reversed(
908 [stack.pop() for _ in range(arg_count)]))
909 obj = stack.pop()
910 if mname == u'split':
911 assert len(args) == 1
912 assert isinstance(args[0], compat_str)
913 assert isinstance(obj, compat_str)
914 if args[0] == u'':
915 res = list(obj)
916 else:
917 res = obj.split(args[0])
918 stack.append(res)
a7177865
PH
919 elif mname == u'slice':
920 assert len(args) == 1
921 assert isinstance(args[0], int)
922 assert isinstance(obj, list)
923 res = obj[args[0]:]
924 stack.append(res)
925 elif mname == u'join':
926 assert len(args) == 1
927 assert isinstance(args[0], compat_str)
928 assert isinstance(obj, list)
929 res = args[0].join(obj)
930 stack.append(res)
e0df6211
PH
931 elif mname in method_pyfunctions:
932 stack.append(method_pyfunctions[mname](args))
933 else:
934 raise NotImplementedError(
935 u'Unsupported property %r on %r'
936 % (mname, obj))
a7177865
PH
937 elif opcode == 72: # returnvalue
938 res = stack.pop()
939 return res
940 elif opcode == 79: # callpropvoid
941 index = u30(coder)
942 mname = multinames[index]
943 arg_count = u30(coder)
944 args = list(reversed(
945 [stack.pop() for _ in range(arg_count)]))
946 obj = stack.pop()
947 if mname == u'reverse':
948 assert isinstance(obj, list)
949 obj.reverse()
950 else:
951 raise NotImplementedError(
952 u'Unsupported (void) property %r on %r'
953 % (mname, obj))
e0df6211
PH
954 elif opcode == 93: # findpropstrict
955 index = u30(coder)
956 mname = multinames[index]
957 res = extract_function(mname)
958 stack.append(res)
959 elif opcode == 97: # setproperty
960 index = u30(coder)
961 value = stack.pop()
962 idx = stack.pop()
963 obj = stack.pop()
964 assert isinstance(obj, list)
965 assert isinstance(idx, int)
966 obj[idx] = value
967 elif opcode == 98: # getlocal
968 index = u30(coder)
969 stack.append(registers[index])
970 elif opcode == 99: # setlocal
971 index = u30(coder)
972 value = stack.pop()
973 registers[index] = value
974 elif opcode == 102: # getproperty
975 index = u30(coder)
976 pname = multinames[index]
977 if pname == u'length':
978 obj = stack.pop()
979 assert isinstance(obj, list)
980 stack.append(len(obj))
981 else: # Assume attribute access
982 idx = stack.pop()
983 assert isinstance(idx, int)
984 obj = stack.pop()
985 assert isinstance(obj, list)
986 stack.append(obj[idx])
987 elif opcode == 128: # coerce
0ca96d48 988 u30(coder)
e0df6211
PH
989 elif opcode == 133: # coerce_s
990 assert isinstance(stack[-1], (type(None), compat_str))
991 elif opcode == 164: # modulo
992 value2 = stack.pop()
993 value1 = stack.pop()
994 res = value1 % value2
995 stack.append(res)
a7177865
PH
996 elif opcode == 208: # getlocal_0
997 stack.append(registers[0])
998 elif opcode == 209: # getlocal_1
999 stack.append(registers[1])
1000 elif opcode == 210: # getlocal_2
1001 stack.append(registers[2])
1002 elif opcode == 211: # getlocal_3
1003 stack.append(registers[3])
e0df6211
PH
1004 elif opcode == 214: # setlocal_2
1005 registers[2] = stack.pop()
1006 elif opcode == 215: # setlocal_3
1007 registers[3] = stack.pop()
1008 else:
1009 raise NotImplementedError(
1010 u'Unsupported opcode %d' % opcode)
1011
1012 method_pyfunctions[func_name] = resfunc
1013 return resfunc
1014
1015 initial_function = extract_function(u'decipher')
1016 return lambda s: initial_function([s])
1017
83799698 1018 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1019 """Turn the encrypted s field into a working signature"""
6b37f0be 1020
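        # Extracted signature functions are cached per (player_url, len(s)), so
        # each player/signature-length combination is analysed only once per run
        # (in addition to the on-disk cache handled in _extract_signature_function).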
83799698 1021 if player_url is not None:
9f9be844
PH
1022 if player_url.startswith(u'//'):
1023 player_url = u'https:' + player_url
e0df6211 1024 try:
7f8ae73a
PH
1025 player_id = (player_url, len(s))
1026 if player_id not in self._player_cache:
83799698 1027 func = self._extract_signature_function(
c4417ddb 1028 video_id, player_url, len(s)
e0df6211 1029 )
7f8ae73a
PH
1030 self._player_cache[player_id] = func
1031 func = self._player_cache[player_id]
edf3e38e
PH
1032 if self._downloader.params.get('youtube_print_sig_code'):
1033 self._print_sig_code(func, len(s))
1034 return func(s)
0ca96d48 1035 except Exception:
e0df6211 1036 tb = traceback.format_exc()
83799698
PH
1037 self._downloader.report_warning(
1038 u'Automatic signature extraction failed: ' + tb)
e0df6211 1039
d2d8f895
PH
1040 self._downloader.report_warning(
1041 u'Warning: Falling back to static signature algorithm')
920de7a2 1042
2f2ffea9
PH
1043 return self._static_decrypt_signature(
1044 s, video_id, player_url, age_gate)
e0df6211 1045
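    # Fallback: each branch below is a hard-coded slice permutation for one
    # specific signature length, derived from older players; it is only used
    # when automatic signature extraction above fails.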
2f2ffea9 1046 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1047 if age_gate:
1048 # The videos with age protection use another player, so the
1049 # algorithms can be different.
1050 if len(s) == 86:
1051 return s[2:63] + s[82] + s[64:82] + s[63]
1052
bc4b9008 1053 if len(s) == 93:
1054 return s[86:29:-1] + s[88] + s[28:5:-1]
1055 elif len(s) == 92:
444b1165 1056 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
1057 elif len(s) == 91:
1058 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1059 elif len(s) == 90:
1060 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1061 elif len(s) == 89:
1062 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1063 elif len(s) == 88:
3e223834 1064 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1065 elif len(s) == 87:
3a725669 1066 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1067 elif len(s) == 86:
f2c327fd 1068 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1069 elif len(s) == 85:
6ae8ee3f 1070 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1071 elif len(s) == 84:
6f56389b 1072 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1073 elif len(s) == 83:
920de7a2 1074 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1075 elif len(s) == 82:
c21315f2 1076 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1077 elif len(s) == 81:
aedd6bb9 1078 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1079 elif len(s) == 80:
1080 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1081 elif len(s) == 79:
1082 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1083
1084 else:
1085 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1086
1f343eaa 1087 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1088 try:
7fad1c63
JMF
1089 sub_list = self._download_webpage(
1090 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1091 video_id, note=False)
1092 except ExtractorError as err:
de7f3446
JMF
1093 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1094 return {}
1095 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1096
1097 sub_lang_list = {}
1098 for l in lang_list:
1099 lang = l[1]
1100 params = compat_urllib_parse.urlencode({
1101 'lang': lang,
1102 'v': video_id,
ca715127 1103 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
8eddf3e9 1104 'name': l[0].encode('utf-8'),
de7f3446
JMF
1105 })
1106 url = u'http://www.youtube.com/api/timedtext?' + params
1107 sub_lang_list[lang] = url
1108 if not sub_lang_list:
1109 self._downloader.report_warning(u'video doesn\'t have subtitles')
1110 return {}
1111 return sub_lang_list
1112
055e6f36 1113 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1114 """We need the webpage for getting the captions url, pass it as an
1115 argument to speed up the process."""
ca715127 1116 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1117 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1118 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1119 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1120 if mobj is None:
1121 self._downloader.report_warning(err_msg)
1122 return {}
1123 player_config = json.loads(mobj.group(1))
1124 try:
1125 args = player_config[u'args']
1126 caption_url = args[u'ttsurl']
1127 timestamp = args[u'timestamp']
055e6f36
JMF
1128 # We get the available subtitles
1129 list_params = compat_urllib_parse.urlencode({
1130 'type': 'list',
1131 'tlangs': 1,
1132 'asrs': 1,
de7f3446 1133 })
055e6f36
JMF
1134 list_url = caption_url + '&' + list_params
1135 list_page = self._download_webpage(list_url, video_id)
1136 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca 1137 original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
e3dc22ca
JMF
1139 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1140 return {}
1141 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1142
1143 sub_lang_list = {}
1144 for lang_node in caption_list.findall('target'):
1145 sub_lang = lang_node.attrib['lang_code']
1146 params = compat_urllib_parse.urlencode({
1147 'lang': original_lang,
1148 'tlang': sub_lang,
1149 'fmt': sub_format,
1150 'ts': timestamp,
1151 'kind': 'asr',
1152 })
1153 sub_lang_list[sub_lang] = caption_url + '&' + params
1154 return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
1157 except (KeyError, ExtractorError):
1158 self._downloader.report_warning(err_msg)
1159 return {}
1160
c5e8d7af
PH
1161 def _print_formats(self, formats):
1162 print('Available formats:')
1163 for x in formats:
03cc7c20
JMF
1164 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1165 self._video_dimensions.get(x, '???'),
836a086c 1166 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1167
1168 def _extract_id(self, url):
1169 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1170 if mobj is None:
1171 raise ExtractorError(u'Invalid URL: %s' % url)
1172 video_id = mobj.group(2)
1173 return video_id
1174
1d043b93
JMF
1175 def _get_video_url_list(self, url_map):
1176 """
1177 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1178 with the requested formats.
1179 """
1180 req_format = self._downloader.params.get('format', None)
1181 format_limit = self._downloader.params.get('format_limit', None)
1182 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1183 if format_limit is not None and format_limit in available_formats:
1184 format_list = available_formats[available_formats.index(format_limit):]
1185 else:
1186 format_list = available_formats
1187 existing_formats = [x for x in format_list if x in url_map]
1188 if len(existing_formats) == 0:
1189 raise ExtractorError(u'no known formats available for video')
1190 if self._downloader.params.get('listformats', None):
1191 self._print_formats(existing_formats)
1192 return
1193 if req_format is None or req_format == 'best':
1194 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1195 elif req_format == 'worst':
1196 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1197 elif req_format in ('-1', 'all'):
1198 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1d043b93
JMF
1206 req_formats = req_format.split('/')
1207 video_url_list = None
1208 for rf in req_formats:
1209 if rf in url_map:
1210 video_url_list = [(rf, url_map[rf])]
1211 break
bdc6b3fc
AZ
1212 if rf in self._video_formats_map:
1213 for srf in self._video_formats_map[rf]:
1214 if srf in url_map:
1215 video_url_list = [(srf, url_map[srf])]
1216 break
1217 else:
1218 continue
1219 break
1d043b93
JMF
1220 if video_url_list is None:
1221 raise ExtractorError(u'requested format not available')
1222 return video_url_list
1223
1224 def _extract_from_m3u8(self, manifest_url, video_id):
1225 url_map = {}
1226 def _get_urls(_manifest):
1227 lines = _manifest.split('\n')
1228 urls = filter(lambda l: l and not l.startswith('#'),
1229 lines)
1230 return urls
1231 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1232 formats_urls = _get_urls(manifest)
1233 for format_url in formats_urls:
890f62e8 1234 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1235 url_map[itag] = format_url
1236 return url_map
1237
1fb07d10
JG
1238 def _extract_annotations(self, video_id):
1239 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1240 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1241
c5e8d7af
PH
1242 def _real_extract(self, url):
1243 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1244 mobj = re.search(self._NEXT_URL_RE, url)
1245 if mobj:
1246 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1247 video_id = self._extract_id(url)
1248
1249 # Get video webpage
1250 self.report_video_webpage_download(video_id)
1251 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1252 request = compat_urllib_request.Request(url)
1253 try:
1254 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1255 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1256 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1257
1258 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1259
1260 # Attempt to extract SWF player URL
e0df6211 1261 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1262 if mobj is not None:
1263 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1264 else:
1265 player_url = None
1266
1267 # Get video info
1268 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1269 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1270 self.report_age_confirmation()
1271 age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id};
            # this page can be viewed without logging into YouTube
1274 data = compat_urllib_parse.urlencode({'video_id': video_id,
1275 'el': 'embedded',
1276 'gl': 'US',
1277 'hl': 'en',
1278 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1279 'asv': 3,
1280 'sts':'1588',
1281 })
1282 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1283 video_info_webpage = self._download_webpage(video_info_url, video_id,
1284 note=False,
1285 errnote='unable to download video info webpage')
1286 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1287 else:
1288 age_gate = False
1289 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1290 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1291 % (video_id, el_type))
1292 video_info_webpage = self._download_webpage(video_info_url, video_id,
1293 note=False,
1294 errnote='unable to download video info webpage')
1295 video_info = compat_parse_qs(video_info_webpage)
1296 if 'token' in video_info:
1297 break
c5e8d7af
PH
1298 if 'token' not in video_info:
1299 if 'reason' in video_info:
9a82b238 1300 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1301 else:
1302 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1303
1d699755
PH
1304 if 'view_count' in video_info:
1305 view_count = int(video_info['view_count'][0])
1306 else:
1307 view_count = None
1308
c5e8d7af
PH
1309 # Check for "rental" videos
1310 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1311 raise ExtractorError(u'"rental" videos not supported')
1312
1313 # Start extracting information
1314 self.report_information_extraction(video_id)
1315
1316 # uploader
1317 if 'author' not in video_info:
1318 raise ExtractorError(u'Unable to extract uploader name')
1319 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1320
1321 # uploader_id
1322 video_uploader_id = None
1323 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1324 if mobj is not None:
1325 video_uploader_id = mobj.group(1)
1326 else:
1327 self._downloader.report_warning(u'unable to extract uploader nickname')
1328
1329 # title
a8c6b241
PH
1330 if 'title' in video_info:
1331 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1332 else:
1333 self._downloader.report_warning(u'Unable to extract video title')
1334 video_title = u'_'
c5e8d7af
PH
1335
1336 # thumbnail image
7763b04e
JMF
1337 # We try first to get a high quality image:
1338 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1339 video_webpage, re.DOTALL)
1340 if m_thumb is not None:
1341 video_thumbnail = m_thumb.group(1)
1342 elif 'thumbnail_url' not in video_info:
c5e8d7af 1343 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 1344 video_thumbnail = None
c5e8d7af
PH
1345 else: # don't panic if we can't find it
1346 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1347
1348 # upload date
1349 upload_date = None
1350 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1351 if mobj is not None:
1352 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1353 upload_date = unified_strdate(upload_date)
1354
1355 # description
1356 video_description = get_element_by_id("eow-description", video_webpage)
1357 if video_description:
1358 video_description = clean_html(video_description)
1359 else:
1360 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1361 if fd_mobj:
1362 video_description = unescapeHTML(fd_mobj.group(1))
1363 else:
1364 video_description = u''
1365
1366 # subtitles
d82134c3 1367 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1368
c5e8d7af 1369 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1370 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1371 return
1372
1373 if 'length_seconds' not in video_info:
1374 self._downloader.report_warning(u'unable to extract video duration')
1375 video_duration = ''
1376 else:
1377 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1378
1fb07d10
JG
1379 # annotations
1380 video_annotations = None
1381 if self._downloader.params.get('writeannotations', False):
1382 video_annotations = self._extract_annotations(video_id)
1383
c5e8d7af 1384 # Decide which formats to download
c5e8d7af
PH
1385
1386 try:
1387 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1388 if not mobj:
1389 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1390 info = json.loads(mobj.group(1))
1391 args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map;
            # these signatures are encrypted
44d46655 1394 if 'url_encoded_fmt_stream_map' not in args:
f10503db 1395 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
1396 re_signature = re.compile(r'[&,]s=')
1397 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
1398 if m_s is not None:
1399 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1400 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 1401 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 1402 if m_s is not None:
00fe14fc
JMF
1403 if 'adaptive_fmts' in video_info:
1404 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 1405 else:
00fe14fc 1406 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
1407 except ValueError:
1408 pass
1409
1410 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1411 self.report_rtmp_download()
1412 video_url_list = [(None, video_info['conn'][0])]
00fe14fc
JMF
1413 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1414 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1415 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1416 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 1417 url_map = {}
00fe14fc 1418 for url_data_str in encoded_url_map.split(','):
c5e8d7af
PH
1419 url_data = compat_parse_qs(url_data_str)
1420 if 'itag' in url_data and 'url' in url_data:
1421 url = url_data['url'][0]
1422 if 'sig' in url_data:
1423 url += '&signature=' + url_data['sig'][0]
1424 elif 's' in url_data:
e0df6211 1425 encrypted_sig = url_data['s'][0]
769fda3c 1426 if self._downloader.params.get('verbose'):
c108eb73 1427 if age_gate:
bdde940e
PH
1428 if player_url is None:
1429 player_version = 'unknown'
1430 else:
1431 player_version = self._search_regex(
1432 r'-(.+)\.swf$', player_url,
1433 u'flash player', fatal=False)
e0df6211 1434 player_desc = 'flash player %s' % player_version
c108eb73 1435 else:
83799698
PH
1436 player_version = self._search_regex(
1437 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1438 'html5 player', fatal=False)
e0df6211
PH
1439 player_desc = u'html5 player %s' % player_version
1440
1441 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1442 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1443 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1444
83799698 1445 if not age_gate:
e0df6211
PH
1446 jsplayer_url_json = self._search_regex(
1447 r'"assets":.+?"js":\s*("[^"]+")',
1448 video_webpage, u'JS player URL')
83799698 1449 player_url = json.loads(jsplayer_url_json)
e0df6211 1450
83799698
PH
1451 signature = self._decrypt_signature(
1452 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1453 url += '&signature=' + signature
1454 if 'ratebypass' not in url:
1455 url += '&ratebypass=yes'
1456 url_map[url_data['itag'][0]] = url
1d043b93
JMF
1457 video_url_list = self._get_video_url_list(url_map)
1458 if not video_url_list:
c5e8d7af 1459 return
1d043b93
JMF
1460 elif video_info.get('hlsvp'):
1461 manifest_url = video_info['hlsvp'][0]
1462 url_map = self._extract_from_m3u8(manifest_url, video_id)
1463 video_url_list = self._get_video_url_list(url_map)
1464 if not video_url_list:
1465 return
1466
c5e8d7af 1467 else:
9abb3204 1468 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af
PH
1469
1470 results = []
600cc1a4 1471 for itag, video_real_url in video_url_list:
c5e8d7af 1472 # Extension
600cc1a4 1473 video_extension = self._video_extensions.get(itag, 'flv')
c5e8d7af 1474
600cc1a4
JMF
1475 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1476 self._video_dimensions.get(itag, '???'),
1477 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
c5e8d7af
PH
1478
1479 results.append({
1480 'id': video_id,
1481 'url': video_real_url,
1482 'uploader': video_uploader,
1483 'uploader_id': video_uploader_id,
1484 'upload_date': upload_date,
1485 'title': video_title,
1486 'ext': video_extension,
1487 'format': video_format,
600cc1a4 1488 'format_id': itag,
c5e8d7af
PH
1489 'thumbnail': video_thumbnail,
1490 'description': video_description,
1491 'player_url': player_url,
1492 'subtitles': video_subtitles,
8dbe9899 1493 'duration': video_duration,
cfadd183 1494 'age_limit': 18 if age_gate else 0,
9103bbc5
JMF
1495 'annotations': video_annotations,
1496 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1d699755 1497 'view_count': view_count,
c5e8d7af
PH
1498 })
1499 return results

class YoutubePlaylistIE(InfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'
1520
1521 @classmethod
1522 def suitable(cls, url):
1523 """Receives a URL and returns True if suitable for this IE."""
1524 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1525
1526 def _real_extract(self, url):
1527 # Extract playlist id
1528 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1529 if mobj is None:
1530 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1531 playlist_id = mobj.group(1) or mobj.group(2)
1532
1533 # Check if it's a video-specific URL
7c61bd36 1534 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1535 if 'v' in query_dict:
1536 video_id = query_dict['v'][0]
1537 if self._downloader.params.get('noplaylist'):
1538 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
1539 return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
1540 else:
1541 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af
PH
1542
1543 # Download playlist videos from API
c5e8d7af
PH
1544 videos = []
1545
755eb032 1546 for page_num in itertools.count(1):
771822eb
JMF
1547 start_index = self._MAX_RESULTS * (page_num - 1) + 1
1548 if start_index >= 1000:
1549 self._downloader.report_warning(u'Max number of results reached')
1550 break
1551 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
c5e8d7af
PH
1552 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1553
1554 try:
1555 response = json.loads(page)
1556 except ValueError as err:
1557 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1558
1559 if 'feed' not in response:
1560 raise ExtractorError(u'Got a malformed response from YouTube API')
1561 playlist_title = response['feed']['title']['$t']
1562 if 'entry' not in response['feed']:
1563 # Number of videos is a multiple of self._MAX_RESULTS
1564 break
1565
1566 for entry in response['feed']['entry']:
1567 index = entry['yt$position']['$t']
c215217e
JMF
1568 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
1569 videos.append((
1570 index,
1571 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
1572 ))
c5e8d7af 1573
c5e8d7af
PH
1574 videos = [v[1] for v in sorted(videos)]
1575
20c3893f 1576 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
c5e8d7af
PH
1577 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1578
1579
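The YoutubePlaylistIE code above pages through the (now retired) gdata v2 playlist feed 50 entries at a time. A rough standalone sketch of that paging loop, written against Python 3's urllib for brevity while the extractor itself goes through the compat_* wrappers imported at the top of the file; since the endpoint no longer exists, this only illustrates the control flow:

import itertools
import json
import urllib.request

TEMPLATE = ('https://gdata.youtube.com/feeds/api/playlists/%s'
            '?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none')
MAX_RESULTS = 50

def playlist_video_urls(playlist_id):
    videos = []
    for page_num in itertools.count(1):
        start_index = MAX_RESULTS * (page_num - 1) + 1
        if start_index >= 1000:      # the API refused start-index values this high
            break
        url = TEMPLATE % (playlist_id, MAX_RESULTS, start_index)
        with urllib.request.urlopen(url) as f:
            response = json.load(f)
        if 'entry' not in response['feed']:
            break                    # playlist length was an exact multiple of 50
        for entry in response['feed']['entry']:
            group = entry.get('media$group', {})
            if 'yt$videoid' in group:
                videos.append((entry['yt$position']['$t'],
                               'https://www.youtube.com/watch?v='
                               + group['yt$videoid']['$t']))
    return [watch_url for _, watch_url in sorted(videos)]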
1580class YoutubeChannelIE(InfoExtractor):
0f818663 1581 IE_DESC = u'YouTube.com channels'
c5e8d7af 1582 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1583 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1584 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1585 IE_NAME = u'youtube:channel'
1586
1587 def extract_videos_from_page(self, page):
1588 ids_in_page = []
1589 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1590 if mobj.group(1) not in ids_in_page:
1591 ids_in_page.append(mobj.group(1))
1592 return ids_in_page
1593
1594 def _real_extract(self, url):
1595 # Extract channel id
1596 mobj = re.match(self._VALID_URL, url)
1597 if mobj is None:
1598 raise ExtractorError(u'Invalid URL: %s' % url)
1599
1600 # Download channel page
1601 channel_id = mobj.group(1)
1602 video_ids = []
b9643eed
JMF
1603 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1604 channel_page = self._download_webpage(url, channel_id)
1605 if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
1606 autogenerated = True
1607 else:
1608 autogenerated = False
c5e8d7af 1609
b9643eed
JMF
1610 if autogenerated:
1611 # The videos are contained in a single page;
1612 # the ajax pages can't be used, they are empty
1613 video_ids = self.extract_videos_from_page(channel_page)
1614 else:
1615 # Download all channel pages using the json-based channel_ajax query
1616 for pagenum in itertools.count(1):
1617 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1618 page = self._download_webpage(url, channel_id,
1619 u'Downloading page #%s' % pagenum)
1620
1621 page = json.loads(page)
1622
1623 ids_in_page = self.extract_videos_from_page(page['content_html'])
1624 video_ids.extend(ids_in_page)
1625
1626 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1627 break
c5e8d7af
PH
1628
1629 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1630
1631 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
20c3893f 1632 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
c5e8d7af
PH
1633 return [self.playlist_result(url_entries, channel_id)]
1634
1635
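A condensed sketch of the two paths YoutubeChannelIE takes above: auto-generated channels expose every video on the /videos page, while regular channels are paged through the c4_browse_ajax endpoint until the load-more widget disappears. The fetch callable is a hypothetical stand-in for _download_webpage, and both URLs have long since been removed, so this is illustration only:

import itertools
import json
import re

MORE_PAGES_URL = ('http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1'
                  '&flow=list&paging=%s&view=0&sort=da&channel_id=%s')
MORE_PAGES_INDICATOR = 'yt-uix-load-more'

def ids_from_html(html):
    ids = []
    for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', html):
        if mobj.group(1) not in ids:
            ids.append(mobj.group(1))
    return ids

def channel_video_ids(channel_id, fetch):
    # fetch(url) -> str is a stand-in for _download_webpage
    channel_page = fetch('https://www.youtube.com/channel/%s/videos' % channel_id)
    if 'channel-header-autogenerated-label' in channel_page:
        return ids_from_html(channel_page)   # single page, the ajax pages are empty
    video_ids = []
    for pagenum in itertools.count(1):
        page = json.loads(fetch(MORE_PAGES_URL % (pagenum, channel_id)))
        video_ids.extend(ids_from_html(page['content_html']))
        if MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
            break
    return video_ids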
1636class YoutubeUserIE(InfoExtractor):
0f818663 1637 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1638 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1639 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1640 _GDATA_PAGE_SIZE = 50
fd9cf738 1641 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1642 IE_NAME = u'youtube:user'
1643
e3ea4790 1644 @classmethod
f4b05232 1645 def suitable(cls, url):
e3ea4790
JMF
1646 # Don't return True if the url can be extracted with another youtube
1647 # extractor; the regex is too permissive and it would match.
1648 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1649 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1650 else: return super(YoutubeUserIE, cls).suitable(url)
1651
c5e8d7af
PH
1652 def _real_extract(self, url):
1653 # Extract username
1654 mobj = re.match(self._VALID_URL, url)
1655 if mobj is None:
1656 raise ExtractorError(u'Invalid URL: %s' % url)
1657
1658 username = mobj.group(1)
1659
1660 # Download video ids using the YouTube Data API. Result size per
1661 # query is limited (currently to 50 videos), so we have to query
1662 # page by page until a page comes back without video ids - that
1663 # means we have got all of them.
1664
1665 video_ids = []
c5e8d7af 1666
755eb032 1667 for pagenum in itertools.count(0):
c5e8d7af
PH
1668 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1669
1670 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1671 page = self._download_webpage(gdata_url, username,
1672 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1673
fd9cf738
JMF
1674 try:
1675 response = json.loads(page)
1676 except ValueError as err:
1677 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1678 if 'entry' not in response['feed']:
1679 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1680 break
fd9cf738 1681
c5e8d7af
PH
1682 # Extract video identifiers
1683 ids_in_page = []
fd9cf738
JMF
1684 for entry in response['feed']['entry']:
1685 ids_in_page.append(entry['id']['$t'].split('/')[-1])
c5e8d7af
PH
1686 video_ids.extend(ids_in_page)
1687
1688 # A little optimization - if the current page is not
1689 # "full", i.e. does not contain _GDATA_PAGE_SIZE video ids,
1690 # we can assume that this page is the last one - there
1691 # are no more ids on further pages, so there is no need to
1692 # query again.
1693
1694 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1695 break
1696
c5e8d7af 1697 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
20c3893f 1698 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
c5e8d7af 1699 return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1700
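The same pattern in YoutubeUserIE above, reduced to its core: ask the (retired) gdata uploads feed for 50 ids at a time and stop as soon as a page comes back short. fetch is again a hypothetical stand-in for _download_webpage:

import itertools
import json

GDATA_URL = ('http://gdata.youtube.com/feeds/api/users/%s/uploads'
             '?max-results=%d&start-index=%d&alt=json')
PAGE_SIZE = 50

def user_video_ids(username, fetch):
    # fetch(url) -> str is a stand-in for _download_webpage
    video_ids = []
    for pagenum in itertools.count(0):
        start_index = pagenum * PAGE_SIZE + 1
        response = json.loads(fetch(GDATA_URL % (username, PAGE_SIZE, start_index)))
        if 'entry' not in response['feed']:
            break
        ids_in_page = [entry['id']['$t'].split('/')[-1]
                       for entry in response['feed']['entry']]
        video_ids.extend(ids_in_page)
        if len(ids_in_page) < PAGE_SIZE:     # a short page is the last page
            break
    return video_ids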
1701class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1702 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1703 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1704 _MAX_RESULTS = 1000
1705 IE_NAME = u'youtube:search'
1706 _SEARCH_KEY = 'ytsearch'
1707
1708 def report_download_page(self, query, pagenum):
1709 """Report attempt to download search page with given number."""
1710 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1711
1712 def _get_n_results(self, query, n):
1713 """Get a specified number of results for a query"""
1714
1715 video_ids = []
1716 pagenum = 0
1717 limit = n
1718
1719 while (50 * pagenum) < limit:
1720 self.report_download_page(query, pagenum+1)
1721 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1722 request = compat_urllib_request.Request(result_url)
1723 try:
1724 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1725 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1726 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1727 api_response = json.loads(data)['data']
1728
1729 if 'items' not in api_response:
1730 raise ExtractorError(u'[youtube] No video results')
1731
1732 new_ids = list(video['id'] for video in api_response['items'])
1733 video_ids += new_ids
1734
1735 limit = min(n, api_response['totalItems'])
1736 pagenum += 1
1737
1738 if len(video_ids) > n:
1739 video_ids = video_ids[:n]
1740 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1741 return self.playlist_result(videos, query)
75dff0ee 1742
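The paging arithmetic in _get_n_results above is easy to misread, so here it is isolated: pages of 50 are requested until 50*pagenum reaches min(n, totalItems), and the id list is trimmed to n afterwards. This sketch assumes totalItems is known up front, whereas the extractor only learns it from the first response:

def pages_needed(n, total_items, page_size=50):
    # Number of API pages the loop above would fetch.
    limit = min(n, total_items)
    pagenum = 0
    while page_size * pagenum < limit:
        pagenum += 1
    return pagenum

print(pages_needed(1000, 120))   # 3 - only 120 results exist
print(pages_needed(75, 10000))   # 2 - 100 ids fetched, trimmed to 75 afterwards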
a3dd9248
CM
1743class YoutubeSearchDateIE(YoutubeSearchIE):
1744 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1745 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1746 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1747
1748class YoutubeShowIE(InfoExtractor):
0f818663 1749 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1750 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1751 IE_NAME = u'youtube:show'
1752
1753 def _real_extract(self, url):
1754 mobj = re.match(self._VALID_URL, url)
1755 show_name = mobj.group(1)
1756 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1757 # There's one playlist for each season of the show
1758 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1759 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1760 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1761
1762
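YoutubeShowIE above does nothing more than scrape one playlist link per season; a tiny sketch with made-up markup (the real show pages were of course richer than this):

import re

# Made-up markup standing in for a show page.
webpage = '''
<a href="/playlist?list=PLxxxxxxxxxxxxxxxx01">Season 1</a>
<a href="/playlist?list=PLxxxxxxxxxxxxxxxx02">Season 2</a>
'''
m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
season_urls = ['https://www.youtube.com' + m.group(1) for m in m_seasons]
print(len(m_seasons), season_urls)   # 2 seasons, one playlist URL each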
b2e8bc1b 1763class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1764 """
1765 Base class for extractors that fetch info from
1766 http://www.youtube.com/feed_ajax
1767 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1768 """
b2e8bc1b 1769 _LOGIN_REQUIRED = True
04cc9617 1770 _PAGING_STEP = 30
43ba5456
JMF
1771 # use action_load_personal_feed instead of action_load_system_feed
1772 _PERSONAL_FEED = False
04cc9617 1773
d7ae0639
JMF
1774 @property
1775 def _FEED_TEMPLATE(self):
43ba5456
JMF
1776 action = 'action_load_system_feed'
1777 if self._PERSONAL_FEED:
1778 action = 'action_load_personal_feed'
1779 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
d7ae0639
JMF
1780
1781 @property
1782 def IE_NAME(self):
1783 return u'youtube:%s' % self._FEED_NAME
04cc9617 1784
81f0259b 1785 def _real_initialize(self):
b2e8bc1b 1786 self._login()
81f0259b 1787
04cc9617
JMF
1788 def _real_extract(self, url):
1789 feed_entries = []
1790 # itertools.count() only accepts a step argument in Python 2.7+, so multiply manually
1791 for i in itertools.count(0):
1792 paging = i*self._PAGING_STEP
d7ae0639
JMF
1793 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1794 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1795 u'Downloading page %s' % i)
1796 info = json.loads(info)
1797 feed_html = info['feed_html']
43ba5456 1798 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617
JMF
1799 ids = orderedSet(m.group(1) for m in m_ids)
1800 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1801 if info['paging'] is None:
1802 break
d7ae0639
JMF
1803 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1804
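The shared feed paging above, condensed: each feed_ajax request returns a chunk of rendered HTML plus a paging cursor, video ids are scraped out of the HTML, and the loop stops when the cursor comes back null. fetch is a hypothetical stand-in for _download_webpage, and feed_ajax itself is long gone:

import itertools
import json
import re

def feed_video_ids(feed_name, fetch, paging_step=30, personal=False):
    # fetch(url) -> str is a stand-in for _download_webpage
    action = 'action_load_personal_feed' if personal else 'action_load_system_feed'
    template = ('http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s'
                % (action, feed_name))
    seen, ordered = set(), []
    for i in itertools.count(0):
        info = json.loads(fetch(template % (i * paging_step)))
        for m in re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html']):
            if m.group(1) not in seen:       # same effect as orderedSet()
                seen.add(m.group(1))
                ordered.append(m.group(1))
        if info['paging'] is None:
            break
    return ordered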
1805class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1806 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1807 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1808 _FEED_NAME = 'subscriptions'
1809 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1810
1811class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1812 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1813 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1814 _FEED_NAME = 'recommended'
1815 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1816
43ba5456
JMF
1817class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1818 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1819 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1820 _FEED_NAME = 'watch_later'
1821 _PLAYLIST_TITLE = u'Youtube Watch Later'
1822 _PAGING_STEP = 100
1823 _PERSONAL_FEED = True
c626a3d9
JMF
1824
1825class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1826 IE_NAME = u'youtube:favorites'
1827 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1828 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1829 _LOGIN_REQUIRED = True
1830
1831 def _real_extract(self, url):
1832 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1833 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1834 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1835
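The favourites extractor above is only a redirect: it scrapes the backing playlist id out of the page and hands it to YoutubePlaylistIE. A sketch of that single regex against made-up markup:

import re

webpage = '<a href="/playlist?list=FLabcdefghijklmnopqrstuv&feature=foo">'  # made-up sample
playlist_id = re.search(r'list=(.+?)["&]', webpage).group(1)
print(playlist_id)   # FLabcdefghijklmnopqrstuv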
1836
1837class YoutubeTruncatedURLIE(InfoExtractor):
1838 IE_NAME = 'youtube:truncated_url'
1839 IE_DESC = False # Do not list
1840 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1841
1842 def _real_extract(self, url):
1843 raise ExtractorError(
1844 u'Did you forget to quote the URL? Remember that & is a meta '
1845 u'character in most shells, so you want to put the URL in quotes, '
1846 u'like youtube-dl '
1847 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1848 u' (or simply youtube-dl BaW_jenozKc ).',
1849 expected=True)
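What YoutubeTruncatedURLIE guards against, demonstrated on its own regex: an unquoted URL whose '&v=...' part was swallowed by the shell leaves only the feature= query behind and matches, while the full URL does not and falls through to YoutubeIE:

import re

VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
truncated = 'http://www.youtube.com/watch?feature=player_embedded'
complete = 'http://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc'
print(bool(re.match(VALID_URL, truncated)))   # True  -> the error above is raised
print(bool(re.match(VALID_URL, complete)))    # False -> handled by YoutubeIE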