# coding: utf-8

import collections
import errno
import io
import itertools
import json
import os.path
import re
import socket
import string
import struct
import traceback
import xml.etree.ElementTree
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_http_client,
    compat_parse_qs,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_request,
    compat_str,

    clean_html,
    get_element_by_id,
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

46 def report_lang(self):
47 """Report attempt to set language."""
48 self.to_screen(u'Setting language')
49
50 def _set_language(self):
51 request = compat_urllib_request.Request(self._LANG_URL)
52 try:
53 self.report_lang()
54 compat_urllib_request.urlopen(request).read()
55 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
56 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
57 return False
58 return True
59
60 def _login(self):
61 (username, password) = self._get_login_info()
62 # No authentication to be performed
63 if username is None:
64 if self._LOGIN_REQUIRED:
65 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
66 return False
67
68 request = compat_urllib_request.Request(self._LOGIN_URL)
69 try:
70 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
73 return False
74
75 galx = None
76 dsh = None
77 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
78 if match:
79 galx = match.group(1)
80 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
81 if match:
82 dsh = match.group(1)
c5e8d7af 83
b2e8bc1b
JMF
84 # Log in
85 login_form_strs = {
86 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
87 u'Email': username,
88 u'GALX': galx,
89 u'Passwd': password,
90 u'PersistentCookie': u'yes',
91 u'_utf8': u'霱',
92 u'bgresponse': u'js_disabled',
93 u'checkConnection': u'',
94 u'checkedDomains': u'youtube',
95 u'dnConn': u'',
96 u'dsh': dsh,
97 u'pstMsg': u'0',
98 u'rmShown': u'1',
99 u'secTok': u'',
100 u'signIn': u'Sign in',
101 u'timeStmp': u'',
102 u'service': u'youtube',
103 u'uilel': u'3',
104 u'hl': u'en_US',
105 }
106 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
107 # chokes on unicode
108 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
109 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
110 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
111 try:
112 self.report_login()
113 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
114 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
115 self._downloader.report_warning(u'unable to log in: bad username or password')
116 return False
117 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
118 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
119 return False
120 return True
121
122 def _confirm_age(self):
123 age_form = {
124 'next_url': '/',
125 'action_confirm': 'Confirm',
126 }
127 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
128 try:
129 self.report_age_confirmation()
130 compat_urllib_request.urlopen(request).read().decode('utf-8')
131 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
132 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
133 return True
134
135 def _real_initialize(self):
136 if self._downloader is None:
137 return
138 if not self._set_language():
139 return
140 if not self._login():
141 return
142 self._confirm_age()
c5e8d7af 143

class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here it is! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          # 3D
                          '85', '84', '102', '83', '101', '82', '100',
                          # Dash video
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          # Dash audio
                          '141', '172', '140', '171', '139',
                          ]
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      # 3D
                                      '85', '102', '84', '101', '83', '100', '82',
                                      # Dash video
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      # Dash audio
                                      '172', '141', '171', '140', '139',
                                      ]
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    }
c5e8d7af
PH
199 _video_extensions = {
200 '13': '3gp',
bdc6b3fc 201 '17': '3gp',
c5e8d7af
PH
202 '18': 'mp4',
203 '22': 'mp4',
bdc6b3fc 204 '36': '3gp',
c5e8d7af 205 '37': 'mp4',
d69cf69a 206 '38': 'mp4',
c5e8d7af
PH
207 '43': 'webm',
208 '44': 'webm',
209 '45': 'webm',
210 '46': 'webm',
1d043b93 211
86fe61c8
AZ
212 # 3d videos
213 '82': 'mp4',
214 '83': 'mp4',
215 '84': 'mp4',
216 '85': 'mp4',
217 '100': 'webm',
218 '101': 'webm',
219 '102': 'webm',
836a086c 220
96fb5605 221 # Apple HTTP Live Streaming
1d043b93
JMF
222 '92': 'mp4',
223 '93': 'mp4',
224 '94': 'mp4',
225 '95': 'mp4',
226 '96': 'mp4',
227 '132': 'mp4',
228 '151': 'mp4',
836a086c
AZ
229
230 # Dash mp4
231 '133': 'mp4',
232 '134': 'mp4',
233 '135': 'mp4',
234 '136': 'mp4',
235 '137': 'mp4',
236 '138': 'mp4',
237 '139': 'mp4',
238 '140': 'mp4',
239 '141': 'mp4',
240 '160': 'mp4',
241
242 # Dash webm
243 '171': 'webm',
244 '172': 'webm',
245 '242': 'webm',
246 '243': 'webm',
247 '244': 'webm',
248 '245': 'webm',
249 '246': 'webm',
250 '247': 'webm',
251 '248': 'webm',
c5e8d7af
PH
252 }
253 _video_dimensions = {
254 '5': '240x400',
255 '6': '???',
256 '13': '???',
257 '17': '144x176',
258 '18': '360x640',
259 '22': '720x1280',
260 '34': '360x640',
261 '35': '480x854',
bdc6b3fc 262 '36': '240x320',
c5e8d7af
PH
263 '37': '1080x1920',
264 '38': '3072x4096',
265 '43': '360x640',
266 '44': '480x854',
267 '45': '720x1280',
268 '46': '1080x1920',
86fe61c8
AZ
269 '82': '360p',
270 '83': '480p',
271 '84': '720p',
272 '85': '1080p',
1d043b93
JMF
273 '92': '240p',
274 '93': '360p',
275 '94': '480p',
276 '95': '720p',
277 '96': '1080p',
86fe61c8
AZ
278 '100': '360p',
279 '101': '480p',
836a086c 280 '102': '720p',
1d043b93
JMF
281 '132': '240p',
282 '151': '72p',
836a086c
AZ
283 '133': '240p',
284 '134': '360p',
285 '135': '480p',
286 '136': '720p',
287 '137': '1080p',
288 '138': '>1080p',
289 '139': '48k',
290 '140': '128k',
291 '141': '256k',
292 '160': '192p',
293 '171': '128k',
294 '172': '256k',
295 '242': '240p',
296 '243': '360p',
297 '244': '480p',
298 '245': '480p',
299 '246': '480p',
300 '247': '720p',
301 '248': '1080p',
c5e8d7af 302 }
836a086c
AZ
303 _special_itags = {
304 '82': '3D',
305 '83': '3D',
306 '84': '3D',
307 '85': '3D',
308 '100': '3D',
309 '101': '3D',
310 '102': '3D',
311 '133': 'DASH Video',
312 '134': 'DASH Video',
313 '135': 'DASH Video',
314 '136': 'DASH Video',
315 '137': 'DASH Video',
316 '138': 'DASH Video',
317 '139': 'DASH Audio',
318 '140': 'DASH Audio',
319 '141': 'DASH Audio',
320 '160': 'DASH Video',
321 '171': 'DASH Audio',
322 '172': 'DASH Audio',
323 '242': 'DASH Video',
324 '243': 'DASH Video',
325 '244': 'DASH Video',
326 '245': 'DASH Video',
327 '246': 'DASH Video',
328 '247': 'DASH Video',
329 '248': 'DASH Video',
c5e8d7af 330 }
836a086c 331
c5e8d7af 332 IE_NAME = u'youtube'
2eb88d95
PH
333 _TESTS = [
334 {
0e853ca4
PH
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
337 u"info_dict": {
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 343 }
0e853ca4
PH
344 },
345 {
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
349 u"info_dict": {
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
2eb88d95 355 }
0e853ca4
PH
356 },
357 {
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
361 u"info_dict": {
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 364 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 365 u"uploader": u"Icona Pop",
0e853ca4 366 u"uploader_id": u"IconaPop"
2eb88d95 367 }
c108eb73
JMF
368 },
369 {
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
373 u"info_dict": {
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
379 }
380 },
2eb88d95
PH
381 ]
382
c5e8d7af
PH
383
384 @classmethod
385 def suitable(cls, url):
386 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 387 if YoutubePlaylistIE.suitable(url): return False
c5e8d7af
PH
388 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
389
e0df6211
PH
390 def __init__(self, *args, **kwargs):
391 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 392 self._player_cache = {}
e0df6211 393
c5e8d7af
PH
394 def report_video_webpage_download(self, video_id):
395 """Report attempt to download video webpage."""
396 self.to_screen(u'%s: Downloading video webpage' % video_id)
397
398 def report_video_info_webpage_download(self, video_id):
399 """Report attempt to download video info webpage."""
400 self.to_screen(u'%s: Downloading video info webpage' % video_id)
401
c5e8d7af
PH
402 def report_information_extraction(self, video_id):
403 """Report attempt to extract video information."""
404 self.to_screen(u'%s: Extracting video information' % video_id)
405
406 def report_unavailable_format(self, video_id, format):
407 """Report extracted video URL."""
408 self.to_screen(u'%s: Format %s not available' % (video_id, format))
409
410 def report_rtmp_download(self):
411 """Indicate the download will use the RTMP protocol."""
412 self.to_screen(u'RTMP download detected')
413
    def _extract_signature_function(self, video_id, player_url, slen):
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = self._downloader.params.get('cachedir',
                                                u'~/.youtube-dl/cache')

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                    return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available
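            # Illustrative note (hedged sketch, not from the original source):
            # cache_spec is simply a list of character positions, so a
            # hypothetical cache_spec == [2, 0, 1] maps the signature 'abc'
            # to 'cab' via u''.join(s[i] for i in cache_spec).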
83799698 437
e0df6211
PH
438 if player_type == 'js':
439 code = self._download_webpage(
440 player_url, video_id,
83799698 441 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 442 errnote=u'Download of %s failed' % player_url)
83799698 443 res = self._parse_sig_js(code)
c4417ddb 444 elif player_type == 'swf':
e0df6211
PH
445 urlh = self._request_webpage(
446 player_url, video_id,
83799698 447 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
448 errnote=u'Download of %s failed' % player_url)
449 code = urlh.read()
83799698 450 res = self._parse_sig_swf(code)
e0df6211
PH
451 else:
452 assert False, 'Invalid player type %r' % player_type
453
f8061589 454 if cache_enabled:
edf3e38e 455 try:
c705320f
PH
456 test_string = u''.join(map(compat_chr, range(slen)))
457 cache_res = res(test_string)
edf3e38e
PH
458 cache_spec = [ord(c) for c in cache_res]
459 try:
460 os.makedirs(os.path.dirname(cache_fn))
461 except OSError as ose:
462 if ose.errno != errno.EEXIST:
463 raise
464 write_json_file(cache_spec, cache_fn)
0ca96d48 465 except Exception:
edf3e38e
PH
466 tb = traceback.format_exc()
467 self._downloader.report_warning(
468 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
469
470 return res
471
edf3e38e
PH
472 def _print_sig_code(self, func, slen):
473 def gen_sig_code(idxs):
474 def _genslice(start, end, step):
475 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
476 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
477 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
478 return u's[%s%s%s]' % (starts, ends, steps)
479
480 step = None
            start = '(Never used)'  # Quell pyflakes warnings - start will be
                                    # set as soon as step is set
edf3e38e
PH
483 for i, prev in zip(idxs[1:], idxs[:-1]):
484 if step is not None:
485 if i - prev == step:
486 continue
487 yield _genslice(start, prev, step)
488 step = None
489 continue
490 if i - prev in [-1, 1]:
491 step = i - prev
492 start = prev
493 continue
494 else:
495 yield u's[%d]' % prev
496 if step is None:
497 yield u's[%d]' % i
498 else:
499 yield _genslice(start, i, step)
500
c705320f
PH
501 test_string = u''.join(map(compat_chr, range(slen)))
502 cache_res = func(test_string)
edf3e38e
PH
503 cache_spec = [ord(c) for c in cache_res]
504 expr_code = u' + '.join(gen_sig_code(cache_spec))
505 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 506 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 507
e0df6211
PH
508 def _parse_sig_js(self, jscode):
509 funcname = self._search_regex(
510 r'signature=([a-zA-Z]+)', jscode,
511 u'Initial JS player signature function name')
512
513 functions = {}
514
515 def argidx(varname):
516 return string.lowercase.index(varname)
517
518 def interpret_statement(stmt, local_vars, allow_recursion=20):
519 if allow_recursion < 0:
0ca96d48 520 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
521
522 if stmt.startswith(u'var '):
523 stmt = stmt[len(u'var '):]
524 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
525 r'=(?P<expr>.*)$', stmt)
526 if ass_m:
527 if ass_m.groupdict().get('index'):
528 def assign(val):
529 lvar = local_vars[ass_m.group('out')]
530 idx = interpret_expression(ass_m.group('index'),
531 local_vars, allow_recursion)
532 assert isinstance(idx, int)
533 lvar[idx] = val
534 return val
535 expr = ass_m.group('expr')
536 else:
537 def assign(val):
538 local_vars[ass_m.group('out')] = val
539 return val
540 expr = ass_m.group('expr')
541 elif stmt.startswith(u'return '):
542 assign = lambda v: v
543 expr = stmt[len(u'return '):]
544 else:
545 raise ExtractorError(
546 u'Cannot determine left side of statement in %r' % stmt)
547
548 v = interpret_expression(expr, local_vars, allow_recursion)
549 return assign(v)
550
551 def interpret_expression(expr, local_vars, allow_recursion):
552 if expr.isdigit():
553 return int(expr)
554
555 if expr.isalpha():
556 return local_vars[expr]
557
558 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
559 if m:
560 member = m.group('member')
561 val = local_vars[m.group('in')]
562 if member == 'split("")':
563 return list(val)
564 if member == 'join("")':
565 return u''.join(val)
566 if member == 'length':
567 return len(val)
568 if member == 'reverse()':
569 return val[::-1]
570 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
571 if slice_m:
572 idx = interpret_expression(
573 slice_m.group('idx'), local_vars, allow_recursion-1)
574 return val[idx:]
575
576 m = re.match(
577 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
578 if m:
579 val = local_vars[m.group('in')]
580 idx = interpret_expression(m.group('idx'), local_vars,
581 allow_recursion-1)
582 return val[idx]
583
584 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
585 if m:
586 a = interpret_expression(m.group('a'),
587 local_vars, allow_recursion)
588 b = interpret_expression(m.group('b'),
589 local_vars, allow_recursion)
590 return a % b
591
592 m = re.match(
593 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
594 if m:
595 fname = m.group('func')
596 if fname not in functions:
597 functions[fname] = extract_function(fname)
598 argvals = [int(v) if v.isdigit() else local_vars[v]
599 for v in m.group('args').split(',')]
600 return functions[fname](argvals)
601 raise ExtractorError(u'Unsupported JS expression %r' % expr)
602
603 def extract_function(funcname):
604 func_m = re.search(
605 r'function ' + re.escape(funcname) +
606 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
607 jscode)
608 argnames = func_m.group('args').split(',')
609
610 def resf(args):
611 local_vars = dict(zip(argnames, args))
612 for stmt in func_m.group('code').split(';'):
613 res = interpret_statement(stmt, local_vars)
614 return res
615 return resf
616
617 initial_function = extract_function(funcname)
618 return lambda s: initial_function([s])
619
620 def _parse_sig_swf(self, file_contents):
621 if file_contents[1:3] != b'WS':
622 raise ExtractorError(
623 u'Not an SWF file; header is %r' % file_contents[:3])
624 if file_contents[:1] == b'C':
625 content = zlib.decompress(file_contents[8:])
626 else:
627 raise NotImplementedError(u'Unsupported compression format %r' %
628 file_contents[:1])
629
630 def extract_tags(content):
631 pos = 0
632 while pos < len(content):
633 header16 = struct.unpack('<H', content[pos:pos+2])[0]
634 pos += 2
635 tag_code = header16 >> 6
636 tag_len = header16 & 0x3f
637 if tag_len == 0x3f:
638 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
639 pos += 4
640 assert pos+tag_len <= len(content)
641 yield (tag_code, content[pos:pos+tag_len])
642 pos += tag_len
643
644 code_tag = next(tag
645 for tag_code, tag in extract_tags(content)
646 if tag_code == 82)
647 p = code_tag.index(b'\0', 4) + 1
ba552f54 648 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
649
650 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
651 def read_int(reader=None):
652 if reader is None:
653 reader = code_reader
e0df6211
PH
654 res = 0
655 shift = 0
656 for _ in range(5):
ba552f54
PH
657 buf = reader.read(1)
658 assert len(buf) == 1
659 b = struct.unpack('<B', buf)[0]
e0df6211
PH
660 res = res | ((b & 0x7f) << shift)
661 if b & 0x80 == 0:
662 break
663 shift += 7
ba552f54
PH
664 return res
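        # Illustrative note (sketch, not part of the original comments):
        # read_int implements the ABC variable-length integer encoding,
        # 7 data bits per byte, little-endian, with the high bit meaning
        # "more bytes follow". For example, the two bytes 0x88 0x01 decode
        # to (0x88 & 0x7f) | (0x01 << 7) = 0x88 = 136.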
665
666 def u30(reader=None):
667 res = read_int(reader)
668 assert res & 0xf0000000 == 0
e0df6211
PH
669 return res
670 u32 = read_int
671
ba552f54
PH
672 def s32(reader=None):
673 v = read_int(reader)
e0df6211
PH
674 if v & 0x80000000 != 0:
675 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
676 return v
677
0ca96d48 678 def read_string(reader=None):
ba552f54
PH
679 if reader is None:
680 reader = code_reader
681 slen = u30(reader)
682 resb = reader.read(slen)
683 assert len(resb) == slen
684 return resb.decode('utf-8')
685
686 def read_bytes(count, reader=None):
687 if reader is None:
688 reader = code_reader
689 resb = reader.read(count)
690 assert len(resb) == count
691 return resb
692
693 def read_byte(reader=None):
694 resb = read_bytes(1, reader=reader)
695 res = struct.unpack('<B', resb)[0]
696 return res
e0df6211
PH
697
698 # minor_version + major_version
0ca96d48 699 read_bytes(2 + 2)
e0df6211
PH
700
701 # Constant pool
ba552f54 702 int_count = u30()
e0df6211 703 for _c in range(1, int_count):
0ca96d48 704 s32()
ba552f54 705 uint_count = u30()
e0df6211 706 for _c in range(1, uint_count):
0ca96d48 707 u32()
ba552f54 708 double_count = u30()
0ca96d48 709 read_bytes((double_count-1) * 8)
ba552f54 710 string_count = u30()
e0df6211
PH
711 constant_strings = [u'']
712 for _c in range(1, string_count):
0ca96d48 713 s = read_string()
e0df6211 714 constant_strings.append(s)
ba552f54 715 namespace_count = u30()
e0df6211 716 for _c in range(1, namespace_count):
0ca96d48
PH
717 read_bytes(1) # kind
718 u30() # name
ba552f54 719 ns_set_count = u30()
e0df6211 720 for _c in range(1, ns_set_count):
ba552f54 721 count = u30()
e0df6211 722 for _c2 in range(count):
0ca96d48 723 u30()
ba552f54 724 multiname_count = u30()
e0df6211
PH
725 MULTINAME_SIZES = {
726 0x07: 2, # QName
727 0x0d: 2, # QNameA
728 0x0f: 1, # RTQName
729 0x10: 1, # RTQNameA
730 0x11: 0, # RTQNameL
731 0x12: 0, # RTQNameLA
732 0x09: 2, # Multiname
733 0x0e: 2, # MultinameA
734 0x1b: 1, # MultinameL
735 0x1c: 1, # MultinameLA
736 }
737 multinames = [u'']
738 for _c in range(1, multiname_count):
ba552f54 739 kind = u30()
e0df6211
PH
740 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
741 if kind == 0x07:
0ca96d48 742 u30() # namespace_idx
ba552f54 743 name_idx = u30()
e0df6211
PH
744 multinames.append(constant_strings[name_idx])
745 else:
746 multinames.append('[MULTINAME kind: %d]' % kind)
747 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 748 u30()
e0df6211
PH
749
750 # Methods
ba552f54 751 method_count = u30()
e0df6211
PH
752 MethodInfo = collections.namedtuple(
753 'MethodInfo',
754 ['NEED_ARGUMENTS', 'NEED_REST'])
755 method_infos = []
756 for method_id in range(method_count):
ba552f54 757 param_count = u30()
0ca96d48 758 u30() # return type
e0df6211 759 for _ in range(param_count):
0ca96d48
PH
760 u30() # param type
761 u30() # name index (always 0 for youtube)
ba552f54 762 flags = read_byte()
e0df6211
PH
763 if flags & 0x08 != 0:
764 # Options present
ba552f54 765 option_count = u30()
e0df6211 766 for c in range(option_count):
0ca96d48
PH
767 u30() # val
768 read_bytes(1) # kind
e0df6211
PH
769 if flags & 0x80 != 0:
770 # Param names present
771 for _ in range(param_count):
0ca96d48 772 u30() # param name
e0df6211
PH
773 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
774 method_infos.append(mi)
775
776 # Metadata
ba552f54 777 metadata_count = u30()
e0df6211 778 for _c in range(metadata_count):
0ca96d48 779 u30() # name
ba552f54 780 item_count = u30()
e0df6211 781 for _c2 in range(item_count):
0ca96d48
PH
782 u30() # key
783 u30() # value
ba552f54
PH
784
785 def parse_traits_info():
786 trait_name_idx = u30()
787 kind_full = read_byte()
e0df6211
PH
788 kind = kind_full & 0x0f
789 attrs = kind_full >> 4
790 methods = {}
791 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
792 u30() # Slot id
793 u30() # type_name_idx
ba552f54 794 vindex = u30()
e0df6211 795 if vindex != 0:
0ca96d48 796 read_byte() # vkind
e0df6211 797 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 798 u30() # disp_id
ba552f54 799 method_idx = u30()
e0df6211
PH
800 methods[multinames[trait_name_idx]] = method_idx
801 elif kind == 0x04: # Class
0ca96d48
PH
802 u30() # slot_id
803 u30() # classi
e0df6211 804 elif kind == 0x05: # Function
0ca96d48 805 u30() # slot_id
ba552f54 806 function_idx = u30()
e0df6211
PH
807 methods[function_idx] = multinames[trait_name_idx]
808 else:
809 raise ExtractorError(u'Unsupported trait kind %d' % kind)
810
811 if attrs & 0x4 != 0: # Metadata present
ba552f54 812 metadata_count = u30()
e0df6211 813 for _c3 in range(metadata_count):
0ca96d48 814 u30() # metadata index
e0df6211 815
ba552f54 816 return methods
e0df6211
PH
817
818 # Classes
819 TARGET_CLASSNAME = u'SignatureDecipher'
820 searched_idx = multinames.index(TARGET_CLASSNAME)
821 searched_class_id = None
ba552f54 822 class_count = u30()
e0df6211 823 for class_id in range(class_count):
ba552f54 824 name_idx = u30()
e0df6211
PH
825 if name_idx == searched_idx:
826 # We found the class we're looking for!
827 searched_class_id = class_id
0ca96d48 828 u30() # super_name idx
ba552f54 829 flags = read_byte()
e0df6211 830 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 831 u30() # protected_ns_idx
ba552f54 832 intrf_count = u30()
e0df6211 833 for _c2 in range(intrf_count):
0ca96d48
PH
834 u30()
835 u30() # iinit
ba552f54 836 trait_count = u30()
e0df6211 837 for _c2 in range(trait_count):
0ca96d48 838 parse_traits_info()
e0df6211
PH
839
840 if searched_class_id is None:
841 raise ExtractorError(u'Target class %r not found' %
842 TARGET_CLASSNAME)
843
844 method_names = {}
845 method_idxs = {}
846 for class_id in range(class_count):
0ca96d48 847 u30() # cinit
ba552f54 848 trait_count = u30()
e0df6211 849 for _c2 in range(trait_count):
ba552f54 850 trait_methods = parse_traits_info()
e0df6211
PH
851 if class_id == searched_class_id:
852 method_names.update(trait_methods.items())
853 method_idxs.update(dict(
854 (idx, name)
855 for name, idx in trait_methods.items()))
856
857 # Scripts
ba552f54 858 script_count = u30()
e0df6211 859 for _c in range(script_count):
0ca96d48 860 u30() # init
ba552f54 861 trait_count = u30()
e0df6211 862 for _c2 in range(trait_count):
0ca96d48 863 parse_traits_info()
e0df6211
PH
864
865 # Method bodies
ba552f54 866 method_body_count = u30()
e0df6211
PH
867 Method = collections.namedtuple('Method', ['code', 'local_count'])
868 methods = {}
869 for _c in range(method_body_count):
ba552f54 870 method_idx = u30()
0ca96d48 871 u30() # max_stack
ba552f54 872 local_count = u30()
0ca96d48
PH
873 u30() # init_scope_depth
874 u30() # max_scope_depth
ba552f54
PH
875 code_length = u30()
876 code = read_bytes(code_length)
e0df6211 877 if method_idx in method_idxs:
ba552f54 878 m = Method(code, local_count)
e0df6211 879 methods[method_idxs[method_idx]] = m
ba552f54 880 exception_count = u30()
e0df6211 881 for _c2 in range(exception_count):
0ca96d48
PH
882 u30() # from
883 u30() # to
884 u30() # target
885 u30() # exc_type
886 u30() # var_name
ba552f54 887 trait_count = u30()
e0df6211 888 for _c2 in range(trait_count):
0ca96d48 889 parse_traits_info()
e0df6211 890
ba552f54 891 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
892 assert len(methods) == len(method_idxs)
893
894 method_pyfunctions = {}
895
896 def extract_function(func_name):
897 if func_name in method_pyfunctions:
898 return method_pyfunctions[func_name]
899 if func_name not in methods:
900 raise ExtractorError(u'Cannot find function %r' % func_name)
901 m = methods[func_name]
902
903 def resfunc(args):
e0df6211
PH
904 registers = ['(this)'] + list(args) + [None] * m.local_count
905 stack = []
906 coder = io.BytesIO(m.code)
907 while True:
908 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 909 if opcode == 36: # pushbyte
e0df6211
PH
910 v = struct.unpack('!B', coder.read(1))[0]
911 stack.append(v)
912 elif opcode == 44: # pushstring
913 idx = u30(coder)
914 stack.append(constant_strings[idx])
915 elif opcode == 48: # pushscope
916 # We don't implement the scope register, so we'll just
917 # ignore the popped value
918 stack.pop()
919 elif opcode == 70: # callproperty
920 index = u30(coder)
921 mname = multinames[index]
922 arg_count = u30(coder)
923 args = list(reversed(
924 [stack.pop() for _ in range(arg_count)]))
925 obj = stack.pop()
926 if mname == u'split':
927 assert len(args) == 1
928 assert isinstance(args[0], compat_str)
929 assert isinstance(obj, compat_str)
930 if args[0] == u'':
931 res = list(obj)
932 else:
933 res = obj.split(args[0])
934 stack.append(res)
a7177865
PH
935 elif mname == u'slice':
936 assert len(args) == 1
937 assert isinstance(args[0], int)
938 assert isinstance(obj, list)
939 res = obj[args[0]:]
940 stack.append(res)
941 elif mname == u'join':
942 assert len(args) == 1
943 assert isinstance(args[0], compat_str)
944 assert isinstance(obj, list)
945 res = args[0].join(obj)
946 stack.append(res)
e0df6211
PH
947 elif mname in method_pyfunctions:
948 stack.append(method_pyfunctions[mname](args))
949 else:
950 raise NotImplementedError(
951 u'Unsupported property %r on %r'
952 % (mname, obj))
a7177865
PH
953 elif opcode == 72: # returnvalue
954 res = stack.pop()
955 return res
956 elif opcode == 79: # callpropvoid
957 index = u30(coder)
958 mname = multinames[index]
959 arg_count = u30(coder)
960 args = list(reversed(
961 [stack.pop() for _ in range(arg_count)]))
962 obj = stack.pop()
963 if mname == u'reverse':
964 assert isinstance(obj, list)
965 obj.reverse()
966 else:
967 raise NotImplementedError(
968 u'Unsupported (void) property %r on %r'
969 % (mname, obj))
e0df6211
PH
970 elif opcode == 93: # findpropstrict
971 index = u30(coder)
972 mname = multinames[index]
973 res = extract_function(mname)
974 stack.append(res)
975 elif opcode == 97: # setproperty
976 index = u30(coder)
977 value = stack.pop()
978 idx = stack.pop()
979 obj = stack.pop()
980 assert isinstance(obj, list)
981 assert isinstance(idx, int)
982 obj[idx] = value
983 elif opcode == 98: # getlocal
984 index = u30(coder)
985 stack.append(registers[index])
986 elif opcode == 99: # setlocal
987 index = u30(coder)
988 value = stack.pop()
989 registers[index] = value
990 elif opcode == 102: # getproperty
991 index = u30(coder)
992 pname = multinames[index]
993 if pname == u'length':
994 obj = stack.pop()
995 assert isinstance(obj, list)
996 stack.append(len(obj))
997 else: # Assume attribute access
998 idx = stack.pop()
999 assert isinstance(idx, int)
1000 obj = stack.pop()
1001 assert isinstance(obj, list)
1002 stack.append(obj[idx])
1003 elif opcode == 128: # coerce
0ca96d48 1004 u30(coder)
e0df6211
PH
1005 elif opcode == 133: # coerce_s
1006 assert isinstance(stack[-1], (type(None), compat_str))
1007 elif opcode == 164: # modulo
1008 value2 = stack.pop()
1009 value1 = stack.pop()
1010 res = value1 % value2
1011 stack.append(res)
a7177865
PH
1012 elif opcode == 208: # getlocal_0
1013 stack.append(registers[0])
1014 elif opcode == 209: # getlocal_1
1015 stack.append(registers[1])
1016 elif opcode == 210: # getlocal_2
1017 stack.append(registers[2])
1018 elif opcode == 211: # getlocal_3
1019 stack.append(registers[3])
e0df6211
PH
1020 elif opcode == 214: # setlocal_2
1021 registers[2] = stack.pop()
1022 elif opcode == 215: # setlocal_3
1023 registers[3] = stack.pop()
1024 else:
1025 raise NotImplementedError(
1026 u'Unsupported opcode %d' % opcode)
1027
1028 method_pyfunctions[func_name] = resfunc
1029 return resfunc
1030
1031 initial_function = extract_function(u'decipher')
1032 return lambda s: initial_function([s])
1033
83799698 1034 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1035 """Turn the encrypted s field into a working signature"""
6b37f0be 1036
83799698 1037 if player_url is not None:
e0df6211 1038 try:
83799698
PH
1039 if player_url not in self._player_cache:
1040 func = self._extract_signature_function(
c4417ddb 1041 video_id, player_url, len(s)
e0df6211 1042 )
83799698 1043 self._player_cache[player_url] = func
edf3e38e
PH
1044 func = self._player_cache[player_url]
1045 if self._downloader.params.get('youtube_print_sig_code'):
1046 self._print_sig_code(func, len(s))
1047 return func(s)
0ca96d48 1048 except Exception:
e0df6211 1049 tb = traceback.format_exc()
83799698
PH
1050 self._downloader.report_warning(
1051 u'Automatic signature extraction failed: ' + tb)
e0df6211 1052
        self._downloader.report_warning(
            u'Falling back to static signature algorithm')
920de7a2 1055
2f2ffea9
PH
1056 return self._static_decrypt_signature(
1057 s, video_id, player_url, age_gate)
e0df6211 1058
2f2ffea9 1059 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1060 if age_gate:
1061 # The videos with age protection use another player, so the
1062 # algorithms can be different.
1063 if len(s) == 86:
1064 return s[2:63] + s[82] + s[64:82] + s[63]
1065
bc4b9008 1066 if len(s) == 93:
1067 return s[86:29:-1] + s[88] + s[28:5:-1]
1068 elif len(s) == 92:
444b1165 1069 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
1070 elif len(s) == 91:
1071 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1072 elif len(s) == 90:
1073 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1074 elif len(s) == 89:
1075 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1076 elif len(s) == 88:
3e223834 1077 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1078 elif len(s) == 87:
3a725669 1079 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1080 elif len(s) == 86:
f2c327fd 1081 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1082 elif len(s) == 85:
6ae8ee3f 1083 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1084 elif len(s) == 84:
6f56389b 1085 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1086 elif len(s) == 83:
920de7a2 1087 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1088 elif len(s) == 82:
ce85f022 1089 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
be547e1d 1090 elif len(s) == 81:
aedd6bb9 1091 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1092 elif len(s) == 80:
1093 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1094 elif len(s) == 79:
1095 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1096
1097 else:
1098 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1099
de7f3446 1100 def _get_available_subtitles(self, video_id):
de7f3446 1101 try:
7fad1c63
JMF
1102 sub_list = self._download_webpage(
1103 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1104 video_id, note=False)
1105 except ExtractorError as err:
de7f3446
JMF
1106 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1107 return {}
1108 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1109
1110 sub_lang_list = {}
1111 for l in lang_list:
1112 lang = l[1]
1113 params = compat_urllib_parse.urlencode({
1114 'lang': lang,
1115 'v': video_id,
1116 'fmt': self._downloader.params.get('subtitlesformat'),
1117 })
1118 url = u'http://www.youtube.com/api/timedtext?' + params
1119 sub_lang_list[lang] = url
1120 if not sub_lang_list:
1121 self._downloader.report_warning(u'video doesn\'t have subtitles')
1122 return {}
1123 return sub_lang_list
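        # Illustrative note: the returned mapping goes from language code to
        # a timedtext URL, e.g. (hypothetical values)
        # {'en': 'http://www.youtube.com/api/timedtext?lang=en&v=<video_id>&fmt=srt'}.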
1124
055e6f36 1125 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1126 """We need the webpage for getting the captions url, pass it as an
1127 argument to speed up the process."""
de7f3446
JMF
1128 sub_format = self._downloader.params.get('subtitlesformat')
1129 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1130 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1131 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1132 if mobj is None:
1133 self._downloader.report_warning(err_msg)
1134 return {}
1135 player_config = json.loads(mobj.group(1))
1136 try:
1137 args = player_config[u'args']
1138 caption_url = args[u'ttsurl']
1139 timestamp = args[u'timestamp']
055e6f36
JMF
1140 # We get the available subtitles
1141 list_params = compat_urllib_parse.urlencode({
1142 'type': 'list',
1143 'tlangs': 1,
1144 'asrs': 1,
de7f3446 1145 })
055e6f36
JMF
1146 list_url = caption_url + '&' + list_params
1147 list_page = self._download_webpage(list_url, video_id)
1148 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca
JMF
1149 original_lang_node = caption_list.find('track')
1150 if original_lang_node.attrib.get('kind') != 'asr' :
1151 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1152 return {}
1153 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1154
1155 sub_lang_list = {}
1156 for lang_node in caption_list.findall('target'):
1157 sub_lang = lang_node.attrib['lang_code']
1158 params = compat_urllib_parse.urlencode({
1159 'lang': original_lang,
1160 'tlang': sub_lang,
1161 'fmt': sub_format,
1162 'ts': timestamp,
1163 'kind': 'asr',
1164 })
1165 sub_lang_list[sub_lang] = caption_url + '&' + params
1166 return sub_lang_list
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1172
c5e8d7af
PH
1173 def _print_formats(self, formats):
1174 print('Available formats:')
1175 for x in formats:
03cc7c20
JMF
1176 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1177 self._video_dimensions.get(x, '???'),
836a086c 1178 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1179
1180 def _extract_id(self, url):
1181 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1182 if mobj is None:
1183 raise ExtractorError(u'Invalid URL: %s' % url)
1184 video_id = mobj.group(2)
1185 return video_id
1186
1d043b93
JMF
1187 def _get_video_url_list(self, url_map):
1188 """
1189 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1190 with the requested formats.
1191 """
1192 req_format = self._downloader.params.get('format', None)
1193 format_limit = self._downloader.params.get('format_limit', None)
1194 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1195 if format_limit is not None and format_limit in available_formats:
1196 format_list = available_formats[available_formats.index(format_limit):]
1197 else:
1198 format_list = available_formats
1199 existing_formats = [x for x in format_list if x in url_map]
1200 if len(existing_formats) == 0:
1201 raise ExtractorError(u'no known formats available for video')
1202 if self._downloader.params.get('listformats', None):
1203 self._print_formats(existing_formats)
1204 return
1205 if req_format is None or req_format == 'best':
1206 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1207 elif req_format == 'worst':
1208 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1209 elif req_format in ('-1', 'all'):
1210 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1211 else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Formats can be specified as an itag or as 'mp4', 'flv' etc. We pick the
            # highest quality available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (an mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (an mp4) are available, we pick '5'.
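            # Illustrative sketch (hypothetical values): with
            # url_map = {'18': 'http://ex/18', '34': 'http://ex/34'} and
            # req_format = '22/mp4/34', itag '22' is unavailable, 'mp4' maps
            # to ['38', '37', '22', '18'] of which '18' is available, so the
            # result would be [('18', 'http://ex/18')].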
1d043b93
JMF
1218 req_formats = req_format.split('/')
1219 video_url_list = None
1220 for rf in req_formats:
1221 if rf in url_map:
1222 video_url_list = [(rf, url_map[rf])]
1223 break
bdc6b3fc
AZ
1224 if rf in self._video_formats_map:
1225 for srf in self._video_formats_map[rf]:
1226 if srf in url_map:
1227 video_url_list = [(srf, url_map[srf])]
1228 break
1229 else:
1230 continue
1231 break
1d043b93
JMF
1232 if video_url_list is None:
1233 raise ExtractorError(u'requested format not available')
1234 return video_url_list
1235
1236 def _extract_from_m3u8(self, manifest_url, video_id):
1237 url_map = {}
1238 def _get_urls(_manifest):
1239 lines = _manifest.split('\n')
1240 urls = filter(lambda l: l and not l.startswith('#'),
1241 lines)
1242 return urls
1243 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1244 formats_urls = _get_urls(manifest)
1245 for format_url in formats_urls:
890f62e8 1246 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1247 url_map[itag] = format_url
1248 return url_map
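        # Illustrative note: the m3u8 manifest is plain text in which lines
        # starting with '#' are metadata and the remaining lines are stream
        # URLs; the itag is taken from an 'itag/<n>/' path segment of each
        # URL (hypothetical example: '.../itag/93/playlist.m3u8' -> '93').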
1249
c5e8d7af 1250 def _real_extract(self, url):
d7f44b5b
PH
1251 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1252 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1253
c5e8d7af
PH
1254 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1255 mobj = re.search(self._NEXT_URL_RE, url)
1256 if mobj:
1257 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1258 video_id = self._extract_id(url)
1259
1260 # Get video webpage
1261 self.report_video_webpage_download(video_id)
1262 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1263 request = compat_urllib_request.Request(url)
1264 try:
1265 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1266 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1267 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1268
1269 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1270
1271 # Attempt to extract SWF player URL
e0df6211 1272 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1273 if mobj is not None:
1274 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1275 else:
1276 player_url = None
1277
1278 # Get video info
1279 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1280 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1281 self.report_age_confirmation()
1282 age_gate = True
            # We simulate access to the video from www.youtube.com/v/{video_id};
            # this page can be viewed without logging in to Youtube
1285 data = compat_urllib_parse.urlencode({'video_id': video_id,
1286 'el': 'embedded',
1287 'gl': 'US',
1288 'hl': 'en',
1289 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1290 'asv': 3,
1291 'sts':'1588',
1292 })
1293 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1294 video_info_webpage = self._download_webpage(video_info_url, video_id,
1295 note=False,
1296 errnote='unable to download video info webpage')
1297 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1298 else:
1299 age_gate = False
1300 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1301 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1302 % (video_id, el_type))
1303 video_info_webpage = self._download_webpage(video_info_url, video_id,
1304 note=False,
1305 errnote='unable to download video info webpage')
1306 video_info = compat_parse_qs(video_info_webpage)
1307 if 'token' in video_info:
1308 break
c5e8d7af
PH
1309 if 'token' not in video_info:
1310 if 'reason' in video_info:
9a82b238 1311 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1312 else:
1313 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1314
1315 # Check for "rental" videos
1316 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1317 raise ExtractorError(u'"rental" videos not supported')
1318
1319 # Start extracting information
1320 self.report_information_extraction(video_id)
1321
1322 # uploader
1323 if 'author' not in video_info:
1324 raise ExtractorError(u'Unable to extract uploader name')
1325 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1326
1327 # uploader_id
1328 video_uploader_id = None
1329 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1330 if mobj is not None:
1331 video_uploader_id = mobj.group(1)
1332 else:
1333 self._downloader.report_warning(u'unable to extract uploader nickname')
1334
1335 # title
1336 if 'title' not in video_info:
1337 raise ExtractorError(u'Unable to extract video title')
1338 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1339
1340 # thumbnail image
7763b04e
JMF
1341 # We try first to get a high quality image:
1342 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1343 video_webpage, re.DOTALL)
1344 if m_thumb is not None:
1345 video_thumbnail = m_thumb.group(1)
1346 elif 'thumbnail_url' not in video_info:
c5e8d7af 1347 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 1348 video_thumbnail = None
c5e8d7af
PH
1349 else: # don't panic if we can't find it
1350 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1351
1352 # upload date
1353 upload_date = None
1354 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1355 if mobj is not None:
1356 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1357 upload_date = unified_strdate(upload_date)
1358
1359 # description
1360 video_description = get_element_by_id("eow-description", video_webpage)
1361 if video_description:
1362 video_description = clean_html(video_description)
1363 else:
1364 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1365 if fd_mobj:
1366 video_description = unescapeHTML(fd_mobj.group(1))
1367 else:
1368 video_description = u''
1369
1370 # subtitles
d82134c3 1371 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1372
c5e8d7af 1373 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1374 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1375 return
1376
1377 if 'length_seconds' not in video_info:
1378 self._downloader.report_warning(u'unable to extract video duration')
1379 video_duration = ''
1380 else:
1381 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1382
c5e8d7af 1383 # Decide which formats to download
c5e8d7af
PH
1384
1385 try:
1386 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1387 if not mobj:
1388 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1389 info = json.loads(mobj.group(1))
1390 args = info['args']
            # Easy way to know if an 's' value is in url_encoded_fmt_stream_map;
            # these signatures are encrypted
            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1394 if m_s is not None:
1395 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1396 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
cde846b3 1397 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
b7a68384 1398 if m_s is not None:
37b6d5f6
AZ
1399 if 'url_encoded_fmt_stream_map' in video_info:
1400 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1401 else:
1402 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
211fbc13 1403 elif 'adaptive_fmts' in video_info:
37b6d5f6
AZ
1404 if 'url_encoded_fmt_stream_map' in video_info:
1405 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1406 else:
1407 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
c5e8d7af
PH
1408 except ValueError:
1409 pass
1410
1411 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1412 self.report_rtmp_download()
1413 video_url_list = [(None, video_info['conn'][0])]
1414 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
a7055eb9
JMF
1415 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1416 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af
PH
1417 url_map = {}
1418 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1419 url_data = compat_parse_qs(url_data_str)
1420 if 'itag' in url_data and 'url' in url_data:
1421 url = url_data['url'][0]
1422 if 'sig' in url_data:
1423 url += '&signature=' + url_data['sig'][0]
1424 elif 's' in url_data:
e0df6211 1425 encrypted_sig = url_data['s'][0]
769fda3c 1426 if self._downloader.params.get('verbose'):
c108eb73 1427 if age_gate:
bdde940e
PH
1428 if player_url is None:
1429 player_version = 'unknown'
1430 else:
1431 player_version = self._search_regex(
1432 r'-(.+)\.swf$', player_url,
1433 u'flash player', fatal=False)
e0df6211 1434 player_desc = 'flash player %s' % player_version
c108eb73 1435 else:
83799698
PH
1436 player_version = self._search_regex(
1437 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1438 'html5 player', fatal=False)
e0df6211
PH
1439 player_desc = u'html5 player %s' % player_version
1440
1441 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1442 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1443 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1444
83799698 1445 if not age_gate:
e0df6211
PH
1446 jsplayer_url_json = self._search_regex(
1447 r'"assets":.+?"js":\s*("[^"]+")',
1448 video_webpage, u'JS player URL')
83799698 1449 player_url = json.loads(jsplayer_url_json)
e0df6211 1450
83799698
PH
1451 signature = self._decrypt_signature(
1452 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1453 url += '&signature=' + signature
1454 if 'ratebypass' not in url:
1455 url += '&ratebypass=yes'
1456 url_map[url_data['itag'][0]] = url
1d043b93
JMF
1457 video_url_list = self._get_video_url_list(url_map)
1458 if not video_url_list:
c5e8d7af 1459 return
1d043b93
JMF
1460 elif video_info.get('hlsvp'):
1461 manifest_url = video_info['hlsvp'][0]
1462 url_map = self._extract_from_m3u8(manifest_url, video_id)
1463 video_url_list = self._get_video_url_list(url_map)
1464 if not video_url_list:
1465 return
1466
c5e8d7af 1467 else:
9abb3204 1468 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af
PH
1469
1470 results = []
1471 for format_param, video_real_url in video_url_list:
1472 # Extension
1473 video_extension = self._video_extensions.get(format_param, 'flv')
1474
03cc7c20
JMF
1475 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1476 self._video_dimensions.get(format_param, '???'),
836a086c 1477 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
c5e8d7af
PH
1478
1479 results.append({
1480 'id': video_id,
1481 'url': video_real_url,
1482 'uploader': video_uploader,
1483 'uploader_id': video_uploader_id,
1484 'upload_date': upload_date,
1485 'title': video_title,
1486 'ext': video_extension,
1487 'format': video_format,
1488 'thumbnail': video_thumbnail,
1489 'description': video_description,
1490 'player_url': player_url,
1491 'subtitles': video_subtitles,
1492 'duration': video_duration
1493 })
1494 return results
1495
1496class YoutubePlaylistIE(InfoExtractor):
0f818663 1497 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1498 _VALID_URL = r"""(?:
1499 (?:https?://)?
1500 (?:\w+\.)?
1501 youtube\.com/
1502 (?:
1503 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1504 \? (?:.*?&)*? (?:p|a|list)=
1505 | p/
1506 )
c626a3d9 1507 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1508 .*
1509 |
c626a3d9 1510 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1511 )"""
1512 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1513 _MAX_RESULTS = 50
1514 IE_NAME = u'youtube:playlist'
1515
1516 @classmethod
1517 def suitable(cls, url):
1518 """Receives a URL and returns True if suitable for this IE."""
1519 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1520
1521 def _real_extract(self, url):
1522 # Extract playlist id
1523 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1524 if mobj is None:
1525 raise ExtractorError(u'Invalid URL: %s' % url)
1526
1527 # Download playlist videos from API
1528 playlist_id = mobj.group(1) or mobj.group(2)
c5e8d7af
PH
1529 videos = []
1530
755eb032 1531 for page_num in itertools.count(1):
771822eb
JMF
1532 start_index = self._MAX_RESULTS * (page_num - 1) + 1
1533 if start_index >= 1000:
1534 self._downloader.report_warning(u'Max number of results reached')
1535 break
1536 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
c5e8d7af
PH
1537 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1538
1539 try:
1540 response = json.loads(page)
1541 except ValueError as err:
1542 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1543
1544 if 'feed' not in response:
1545 raise ExtractorError(u'Got a malformed response from YouTube API')
1546 playlist_title = response['feed']['title']['$t']
1547 if 'entry' not in response['feed']:
1548 # Number of videos is a multiple of self._MAX_RESULTS
1549 break
1550
1551 for entry in response['feed']['entry']:
1552 index = entry['yt$position']['$t']
c215217e
JMF
1553 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
1554 videos.append((
1555 index,
1556 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
1557 ))
c5e8d7af 1558
c5e8d7af
PH
1559 videos = [v[1] for v in sorted(videos)]
1560
20c3893f 1561 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
c5e8d7af
PH
1562 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1563
1564
1565class YoutubeChannelIE(InfoExtractor):
0f818663 1566 IE_DESC = u'YouTube.com channels'
c5e8d7af
PH
1567 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1568 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1569 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1570 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1571 IE_NAME = u'youtube:channel'
1572
1573 def extract_videos_from_page(self, page):
1574 ids_in_page = []
1575 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1576 if mobj.group(1) not in ids_in_page:
1577 ids_in_page.append(mobj.group(1))
1578 return ids_in_page
1579
1580 def _real_extract(self, url):
1581 # Extract channel id
1582 mobj = re.match(self._VALID_URL, url)
1583 if mobj is None:
1584 raise ExtractorError(u'Invalid URL: %s' % url)
1585
1586 # Download channel page
1587 channel_id = mobj.group(1)
1588 video_ids = []
1589 pagenum = 1
1590
1591 url = self._TEMPLATE_URL % (channel_id, pagenum)
1592 page = self._download_webpage(url, channel_id,
1593 u'Downloading page #%s' % pagenum)
1594
1595 # Extract video identifiers
1596 ids_in_page = self.extract_videos_from_page(page)
1597 video_ids.extend(ids_in_page)
1598
1599 # Download any subsequent channel pages using the json-based channel_ajax query
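 # The c4_browse_ajax endpoint is expected to return JSON carrying
 # 'content_html' (the rendered video list) and 'load_more_widget_html'
 # (the pager markup); both key names are taken from the code below.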
1600 if self._MORE_PAGES_INDICATOR in page:
755eb032 1601 for pagenum in itertools.count(1):
1602 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1603 page = self._download_webpage(url, channel_id,
1604 u'Downloading page #%s' % pagenum)
1605
1606 page = json.loads(page)
1607
1608 ids_in_page = self.extract_videos_from_page(page['content_html'])
1609 video_ids.extend(ids_in_page)
1610
1611 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1612 break
1613
1614 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1615
1616 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
20c3893f 1617 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
1618 return [self.playlist_result(url_entries, channel_id)]
1619
1620
1621class YoutubeUserIE(InfoExtractor):
0f818663 1622 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
faab1d38 1623 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1624 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1625 _GDATA_PAGE_SIZE = 50
fd9cf738 1626 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1627 IE_NAME = u'youtube:user'
1628
e3ea4790 1629 @classmethod
f4b05232 1630 def suitable(cls, url):
1631 # Don't return True if the url can be extracted with other youtube
1632 # extractor, the regex is too permissive and it would match.
1633 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1634 if any(ie.suitable(url) for ie in other_ies): return False
1635 else: return super(YoutubeUserIE, cls).suitable(url)
1636
1637 def _real_extract(self, url):
1638 # Extract username
1639 mobj = re.match(self._VALID_URL, url)
1640 if mobj is None:
1641 raise ExtractorError(u'Invalid URL: %s' % url)
1642
1643 username = mobj.group(1)
1644
1645 # Download video ids using the YouTube Data API. The result size per
1646 # query is limited (currently to 50 videos), so we query page by
1647 # page until a page returns no video ids - at that point we have
1648 # got all of them.
1649
1650 video_ids = []
c5e8d7af 1651
755eb032 1652 for pagenum in itertools.count(0):
1653 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1654
1655 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1656 page = self._download_webpage(gdata_url, username,
1657 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1658
1659 try:
1660 response = json.loads(page)
1661 except ValueError as err:
1662 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1663 if 'entry' not in response['feed']:
1664 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1665 break
fd9cf738 1666
1667 # Extract video identifiers
1668 ids_in_page = []
1669 for entry in response['feed']['entry']:
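 # Descriptive note, inferred from the split below: entry['id']['$t'] is a
 # feed URL whose last path component is the video id.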
1670 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1671 video_ids.extend(ids_in_page)
1672
1673 # A little optimization - if the current page is not
1674 # "full", i.e. it contains fewer than _GDATA_PAGE_SIZE video ids,
1675 # we can assume that this page is the last one - there
1676 # are no more ids on further pages - no need to query
1677 # again.
1678
1679 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1680 break
1681
c5e8d7af 1682 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
20c3893f 1683 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
c5e8d7af 1684 return [self.playlist_result(url_results, playlist_title = username)]
1685
1686 class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1687 IE_DESC = u'YouTube.com searches'
1688 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1689 _MAX_RESULTS = 1000
1690 IE_NAME = u'youtube:search'
1691 _SEARCH_KEY = 'ytsearch'
1692
1693 def report_download_page(self, query, pagenum):
1694 """Report attempt to download search page with given number."""
1695 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1696
1697 def _get_n_results(self, query, n):
1698 """Get a specified number of results for a query"""
1699
1700 video_ids = []
1701 pagenum = 0
1702 limit = n
1703
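 # Worked example (illustrative): for n = 120 the loop below fetches pages
 # with start-index 1, 51 and 101, and stops once 50 * pagenum is no longer
 # below the (possibly reduced) limit.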
1704 while (50 * pagenum) < limit:
1705 self.report_download_page(query, pagenum+1)
1706 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1707 request = compat_urllib_request.Request(result_url)
1708 try:
1709 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1710 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1711 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1712 api_response = json.loads(data)['data']
1713
1714 if 'items' not in api_response:
1715 raise ExtractorError(u'[youtube] No video results')
1716
1717 new_ids = list(video['id'] for video in api_response['items'])
1718 video_ids += new_ids
1719
1720 limit = min(n, api_response['totalItems'])
1721 pagenum += 1
1722
1723 if len(video_ids) > n:
1724 video_ids = video_ids[:n]
1725 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1726 return self.playlist_result(videos, query)
1727
1728
1729 class YoutubeShowIE(InfoExtractor):
0f818663 1730 IE_DESC = u'YouTube.com (multi-season) shows'
1731 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1732 IE_NAME = u'youtube:show'
1733
1734 def _real_extract(self, url):
1735 mobj = re.match(self._VALID_URL, url)
1736 show_name = mobj.group(1)
1737 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1738 # There's one playlist for each season of the show
1739 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1740 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1741 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
1742
1743
b2e8bc1b 1744 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1745 """
1746 Base class for extractors that fetch info from
1747 http://www.youtube.com/feed_ajax
1748 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1749 """
b2e8bc1b 1750 _LOGIN_REQUIRED = True
04cc9617 1751 _PAGING_STEP = 30
1752 # if True, use action_load_personal_feed instead of action_load_system_feed
1753 _PERSONAL_FEED = False
04cc9617 1754
1755 @property
1756 def _FEED_TEMPLATE(self):
1757 action = 'action_load_system_feed'
1758 if self._PERSONAL_FEED:
1759 action = 'action_load_personal_feed'
1760 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
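 # For example (illustrative expansion), YoutubeRecommendedIE below would
 # build: http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=recommended&paging=%s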
1761
1762 @property
1763 def IE_NAME(self):
1764 return u'youtube:%s' % self._FEED_NAME
04cc9617 1765
81f0259b 1766 def _real_initialize(self):
b2e8bc1b 1767 self._login()
81f0259b 1768
1769 def _real_extract(self, url):
1770 feed_entries = []
1771 # itertools.count() only accepts a step argument in Python 2.7+, so the paging offset is computed manually
1772 for i in itertools.count(0):
1773 paging = i*self._PAGING_STEP
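 # With the default _PAGING_STEP of 30 this requests paging=0, 30, 60, ...
 # (YoutubeWatchLaterIE below overrides the step to 100).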
1774 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1775 u'%s feed' % self._FEED_NAME,
1776 u'Downloading page %s' % i)
1777 info = json.loads(info)
1778 feed_html = info['feed_html']
43ba5456 1779 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1780 ids = orderedSet(m.group(1) for m in m_ids)
1781 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1782 if info['paging'] is None:
1783 break
1784 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1785
1786 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1787 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1788 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1789 _FEED_NAME = 'subscriptions'
1790 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1791
1792 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1793 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1794 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1795 _FEED_NAME = 'recommended'
1796 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1797
1798 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1799 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1800 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1801 _FEED_NAME = 'watch_later'
1802 _PLAYLIST_TITLE = u'Youtube Watch Later'
1803 _PAGING_STEP = 100
1804 _PERSONAL_FEED = True
1805
1806 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1807 IE_NAME = u'youtube:favorites'
1808 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1809 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
1810 _LOGIN_REQUIRED = True
1811
1812 def _real_extract(self, url):
1813 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1814 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
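 # The bare playlist id extracted above is handed to YoutubePlaylistIE,
 # whose _VALID_URL also accepts plain (PL|EC|UU|FL)-prefixed ids
 # (see the second branch of that regex).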
1815 return self.url_result(playlist_id, 'YoutubePlaylist')