]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[generic] Support application/ogg for direct links
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af
PH
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
c5e8d7af
PH
34)
35
de7f3446 36class YoutubeBaseInfoExtractor(InfoExtractor):
b2e8bc1b
JMF
37 """Provide base functions for Youtube extractors"""
38 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
39 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
40 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
41 _NETRC_MACHINE = 'youtube'
42 # If True it will raise an error if no login info is provided
43 _LOGIN_REQUIRED = False
44
b2e8bc1b 45 def _set_language(self):
7cc3570e
PH
46 return bool(self._download_webpage(
47 self._LANG_URL, None,
48 note=u'Setting language', errnote='unable to set language',
49 fatal=False))
b2e8bc1b
JMF
50
51 def _login(self):
52 (username, password) = self._get_login_info()
53 # No authentication to be performed
54 if username is None:
55 if self._LOGIN_REQUIRED:
56 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
57 return False
58
7cc3570e
PH
59 login_page = self._download_webpage(
60 self._LOGIN_URL, None,
61 note=u'Downloading login page',
62 errnote=u'unable to fetch login page', fatal=False)
63 if login_page is False:
64 return
b2e8bc1b 65
795f28f8
PH
66 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
67 login_page, u'Login GALX parameter')
c5e8d7af 68
b2e8bc1b
JMF
69 # Log in
70 login_form_strs = {
71 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
72 u'Email': username,
73 u'GALX': galx,
74 u'Passwd': password,
75 u'PersistentCookie': u'yes',
76 u'_utf8': u'霱',
77 u'bgresponse': u'js_disabled',
78 u'checkConnection': u'',
79 u'checkedDomains': u'youtube',
80 u'dnConn': u'',
b2e8bc1b
JMF
81 u'pstMsg': u'0',
82 u'rmShown': u'1',
83 u'secTok': u'',
84 u'signIn': u'Sign in',
85 u'timeStmp': u'',
86 u'service': u'youtube',
87 u'uilel': u'3',
88 u'hl': u'en_US',
89 }
90 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
91 # chokes on unicode
92 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
93 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
7cc3570e
PH
94
95 req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
96 login_results = self._download_webpage(
97 req, None,
98 note=u'Logging in', errnote=u'unable to log in', fatal=False)
99 if login_results is False:
100 return False
101 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
102 self._downloader.report_warning(u'unable to log in: bad username or password')
b2e8bc1b
JMF
103 return False
104 return True
105
106 def _confirm_age(self):
107 age_form = {
7cc3570e
PH
108 'next_url': '/',
109 'action_confirm': 'Confirm',
110 }
111 req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
112
113 self._download_webpage(
114 req, None,
115 note=u'Confirming age', errnote=u'Unable to confirm age')
b2e8bc1b
JMF
116 return True
117
118 def _real_initialize(self):
119 if self._downloader is None:
120 return
121 if not self._set_language():
122 return
123 if not self._login():
124 return
125 self._confirm_age()
c5e8d7af 126
8377574c 127
de7f3446 128class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 129 IE_DESC = u'YouTube.com'
cb7dfeea 130 _VALID_URL = r"""(?x)^
c5e8d7af 131 (
83aa5293 132 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 133 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
e69ae5b9
JMF
134 tube\.majestyc\.net/|
135 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
136 (?:.*?\#/)? # handle anchor (#/) redirect urls
137 (?: # the various things that can precede the ID:
138 (?:(?:v|embed|e)/) # v/ or embed/ or e/
139 |(?: # or the v= param in all its forms
d741e55a 140 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
141 (?:\?|\#!?) # the params delimiter ? or # or #!
142 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
143 v=
144 )
f4b05232
JMF
145 ))
146 |youtu\.be/ # just youtu.be/xxxx
147 )
c5e8d7af 148 )? # all until now is optional -> you can pass the naked ID
8963d9c2 149 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
150 (?(1).+)? # if we found the ID, everything can follow
151 $"""
c5e8d7af 152 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 153 # Listed in order of quality
bdc6b3fc 154 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 155 # Apple HTTP Live Streaming
bdc6b3fc 156 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
157 # 3D
158 '85', '84', '102', '83', '101', '82', '100',
159 # Dash video
160 '138', '137', '248', '136', '247', '135', '246',
161 '245', '244', '134', '243', '133', '242', '160',
162 # Dash audio
163 '141', '172', '140', '171', '139',
1d043b93 164 ]
bdc6b3fc 165 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 166 # Apple HTTP Live Streaming
bdc6b3fc
AZ
167 '96', '95', '94', '93', '92', '132', '151',
168 # 3D
86fe61c8 169 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
170 # Dash video
171 '138', '248', '137', '247', '136', '246', '245',
172 '244', '135', '243', '134', '242', '133', '160',
173 # Dash audio
174 '172', '141', '171', '140', '139',
1d043b93 175 ]
bdc6b3fc
AZ
176 _video_formats_map = {
177 'flv': ['35', '34', '6', '5'],
178 '3gp': ['36', '17', '13'],
179 'mp4': ['38', '37', '22', '18'],
180 'webm': ['46', '45', '44', '43'],
181 }
c5e8d7af
PH
182 _video_extensions = {
183 '13': '3gp',
bdc6b3fc 184 '17': '3gp',
c5e8d7af
PH
185 '18': 'mp4',
186 '22': 'mp4',
bdc6b3fc 187 '36': '3gp',
c5e8d7af 188 '37': 'mp4',
d69cf69a 189 '38': 'mp4',
c5e8d7af
PH
190 '43': 'webm',
191 '44': 'webm',
192 '45': 'webm',
193 '46': 'webm',
1d043b93 194
86fe61c8
AZ
195 # 3d videos
196 '82': 'mp4',
197 '83': 'mp4',
198 '84': 'mp4',
199 '85': 'mp4',
200 '100': 'webm',
201 '101': 'webm',
202 '102': 'webm',
836a086c 203
96fb5605 204 # Apple HTTP Live Streaming
1d043b93
JMF
205 '92': 'mp4',
206 '93': 'mp4',
207 '94': 'mp4',
208 '95': 'mp4',
209 '96': 'mp4',
210 '132': 'mp4',
211 '151': 'mp4',
836a086c
AZ
212
213 # Dash mp4
214 '133': 'mp4',
215 '134': 'mp4',
216 '135': 'mp4',
217 '136': 'mp4',
218 '137': 'mp4',
219 '138': 'mp4',
836a086c
AZ
220 '160': 'mp4',
221
f6f1fc92
RB
222 # Dash mp4 audio
223 '139': 'm4a',
16f36a6f
RB
224 '140': 'm4a',
225 '141': 'm4a',
836a086c
AZ
226
227 # Dash webm
228 '171': 'webm',
229 '172': 'webm',
230 '242': 'webm',
231 '243': 'webm',
232 '244': 'webm',
233 '245': 'webm',
234 '246': 'webm',
235 '247': 'webm',
236 '248': 'webm',
c5e8d7af
PH
237 }
238 _video_dimensions = {
d5a9bb4e 239 '5': '400x240',
c5e8d7af
PH
240 '6': '???',
241 '13': '???',
d5a9bb4e
RB
242 '17': '176x144',
243 '18': '640x360',
244 '22': '1280x720',
245 '34': '640x360',
246 '35': '854x480',
247 '36': '320x240',
248 '37': '1920x1080',
249 '38': '4096x3072',
250 '43': '640x360',
251 '44': '854x480',
252 '45': '1280x720',
253 '46': '1920x1080',
86fe61c8
AZ
254 '82': '360p',
255 '83': '480p',
256 '84': '720p',
257 '85': '1080p',
1d043b93
JMF
258 '92': '240p',
259 '93': '360p',
260 '94': '480p',
261 '95': '720p',
262 '96': '1080p',
86fe61c8
AZ
263 '100': '360p',
264 '101': '480p',
836a086c 265 '102': '720p',
1d043b93
JMF
266 '132': '240p',
267 '151': '72p',
836a086c
AZ
268 '133': '240p',
269 '134': '360p',
270 '135': '480p',
271 '136': '720p',
272 '137': '1080p',
273 '138': '>1080p',
274 '139': '48k',
275 '140': '128k',
276 '141': '256k',
277 '160': '192p',
278 '171': '128k',
279 '172': '256k',
280 '242': '240p',
281 '243': '360p',
282 '244': '480p',
283 '245': '480p',
284 '246': '480p',
285 '247': '720p',
286 '248': '1080p',
c5e8d7af 287 }
836a086c
AZ
288 _special_itags = {
289 '82': '3D',
290 '83': '3D',
291 '84': '3D',
292 '85': '3D',
293 '100': '3D',
294 '101': '3D',
295 '102': '3D',
296 '133': 'DASH Video',
297 '134': 'DASH Video',
298 '135': 'DASH Video',
299 '136': 'DASH Video',
300 '137': 'DASH Video',
301 '138': 'DASH Video',
302 '139': 'DASH Audio',
303 '140': 'DASH Audio',
304 '141': 'DASH Audio',
305 '160': 'DASH Video',
306 '171': 'DASH Audio',
307 '172': 'DASH Audio',
308 '242': 'DASH Video',
309 '243': 'DASH Video',
310 '244': 'DASH Video',
311 '245': 'DASH Video',
312 '246': 'DASH Video',
313 '247': 'DASH Video',
314 '248': 'DASH Video',
c5e8d7af 315 }
836a086c 316
c5e8d7af 317 IE_NAME = u'youtube'
2eb88d95
PH
318 _TESTS = [
319 {
0e853ca4
PH
320 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
321 u"file": u"BaW_jenozKc.mp4",
322 u"info_dict": {
323 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
324 u"uploader": u"Philipp Hagemeister",
325 u"uploader_id": u"phihag",
326 u"upload_date": u"20121002",
27dcce19 327 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 328 }
0e853ca4 329 },
0e853ca4
PH
330 {
331 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
332 u"file": u"UxxajLWwzqY.mp4",
333 u"note": u"Test generic use_cipher_signature video (#897)",
334 u"info_dict": {
335 u"upload_date": u"20120506",
336 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 337 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 338 u"uploader": u"Icona Pop",
0e853ca4 339 u"uploader_id": u"IconaPop"
2eb88d95 340 }
c108eb73
JMF
341 },
342 {
343 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
344 u"file": u"07FYdnEawAQ.mp4",
345 u"note": u"Test VEVO video with age protection (#956)",
346 u"info_dict": {
347 u"upload_date": u"20130703",
348 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
349 u"description": u"md5:64249768eec3bc4276236606ea996373",
350 u"uploader": u"justintimberlakeVEVO",
351 u"uploader_id": u"justintimberlakeVEVO"
352 }
353 },
fccd3771 354 {
83aa5293 355 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
356 u"file": u"yZIXLfi8CZQ.mp4",
357 u"note": u"Embed-only video (#1746)",
358 u"info_dict": {
359 u"upload_date": u"20120608",
360 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
361 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
362 u"uploader": u"SET India",
363 u"uploader_id": u"setindia"
364 }
365 },
2eb88d95
PH
366 ]
367
c5e8d7af
PH
368
369 @classmethod
370 def suitable(cls, url):
371 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 372 if YoutubePlaylistIE.suitable(url): return False
fccd3771 373 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 374
e0df6211
PH
375 def __init__(self, *args, **kwargs):
376 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 377 self._player_cache = {}
e0df6211 378
c5e8d7af
PH
379 def report_video_info_webpage_download(self, video_id):
380 """Report attempt to download video info webpage."""
381 self.to_screen(u'%s: Downloading video info webpage' % video_id)
382
c5e8d7af
PH
383 def report_information_extraction(self, video_id):
384 """Report attempt to extract video information."""
385 self.to_screen(u'%s: Extracting video information' % video_id)
386
387 def report_unavailable_format(self, video_id, format):
388 """Report extracted video URL."""
389 self.to_screen(u'%s: Format %s not available' % (video_id, format))
390
391 def report_rtmp_download(self):
392 """Indicate the download will use the RTMP protocol."""
393 self.to_screen(u'RTMP download detected')
394
c4417ddb
PH
395 def _extract_signature_function(self, video_id, player_url, slen):
396 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 397 player_url)
e0df6211
PH
398 player_type = id_m.group('ext')
399 player_id = id_m.group('id')
400
c4417ddb
PH
401 # Read from filesystem cache
402 func_id = '%s_%s_%d' % (player_type, player_id, slen)
403 assert os.path.basename(func_id) == func_id
c38b1e77 404 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 405
c3c88a26 406 cache_enabled = cache_dir is not None
f8061589 407 if cache_enabled:
c4417ddb
PH
408 cache_fn = os.path.join(os.path.expanduser(cache_dir),
409 u'youtube-sigfuncs',
410 func_id + '.json')
411 try:
edf3e38e 412 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
413 cache_spec = json.load(cachef)
414 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 415 except IOError:
c4417ddb 416 pass # No cache available
83799698 417
e0df6211
PH
418 if player_type == 'js':
419 code = self._download_webpage(
420 player_url, video_id,
83799698 421 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 422 errnote=u'Download of %s failed' % player_url)
83799698 423 res = self._parse_sig_js(code)
c4417ddb 424 elif player_type == 'swf':
e0df6211
PH
425 urlh = self._request_webpage(
426 player_url, video_id,
83799698 427 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
428 errnote=u'Download of %s failed' % player_url)
429 code = urlh.read()
83799698 430 res = self._parse_sig_swf(code)
e0df6211
PH
431 else:
432 assert False, 'Invalid player type %r' % player_type
433
f8061589 434 if cache_enabled:
edf3e38e 435 try:
c705320f
PH
436 test_string = u''.join(map(compat_chr, range(slen)))
437 cache_res = res(test_string)
edf3e38e
PH
438 cache_spec = [ord(c) for c in cache_res]
439 try:
440 os.makedirs(os.path.dirname(cache_fn))
441 except OSError as ose:
442 if ose.errno != errno.EEXIST:
443 raise
444 write_json_file(cache_spec, cache_fn)
0ca96d48 445 except Exception:
edf3e38e
PH
446 tb = traceback.format_exc()
447 self._downloader.report_warning(
448 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
449
450 return res
451
edf3e38e
PH
452 def _print_sig_code(self, func, slen):
453 def gen_sig_code(idxs):
454 def _genslice(start, end, step):
455 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
456 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
457 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
458 return u's[%s%s%s]' % (starts, ends, steps)
459
460 step = None
0ca96d48
PH
461 start = '(Never used)' # Quelch pyflakes warnings - start will be
462 # set as soon as step is set
edf3e38e
PH
463 for i, prev in zip(idxs[1:], idxs[:-1]):
464 if step is not None:
465 if i - prev == step:
466 continue
467 yield _genslice(start, prev, step)
468 step = None
469 continue
470 if i - prev in [-1, 1]:
471 step = i - prev
472 start = prev
473 continue
474 else:
475 yield u's[%d]' % prev
476 if step is None:
477 yield u's[%d]' % i
478 else:
479 yield _genslice(start, i, step)
480
c705320f
PH
481 test_string = u''.join(map(compat_chr, range(slen)))
482 cache_res = func(test_string)
edf3e38e
PH
483 cache_spec = [ord(c) for c in cache_res]
484 expr_code = u' + '.join(gen_sig_code(cache_spec))
485 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 486 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 487
e0df6211
PH
488 def _parse_sig_js(self, jscode):
489 funcname = self._search_regex(
490 r'signature=([a-zA-Z]+)', jscode,
491 u'Initial JS player signature function name')
492
493 functions = {}
494
495 def argidx(varname):
496 return string.lowercase.index(varname)
497
498 def interpret_statement(stmt, local_vars, allow_recursion=20):
499 if allow_recursion < 0:
0ca96d48 500 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
501
502 if stmt.startswith(u'var '):
503 stmt = stmt[len(u'var '):]
504 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
505 r'=(?P<expr>.*)$', stmt)
506 if ass_m:
507 if ass_m.groupdict().get('index'):
508 def assign(val):
509 lvar = local_vars[ass_m.group('out')]
510 idx = interpret_expression(ass_m.group('index'),
511 local_vars, allow_recursion)
512 assert isinstance(idx, int)
513 lvar[idx] = val
514 return val
515 expr = ass_m.group('expr')
516 else:
517 def assign(val):
518 local_vars[ass_m.group('out')] = val
519 return val
520 expr = ass_m.group('expr')
521 elif stmt.startswith(u'return '):
522 assign = lambda v: v
523 expr = stmt[len(u'return '):]
524 else:
525 raise ExtractorError(
526 u'Cannot determine left side of statement in %r' % stmt)
527
528 v = interpret_expression(expr, local_vars, allow_recursion)
529 return assign(v)
530
531 def interpret_expression(expr, local_vars, allow_recursion):
532 if expr.isdigit():
533 return int(expr)
534
535 if expr.isalpha():
536 return local_vars[expr]
537
538 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
539 if m:
540 member = m.group('member')
541 val = local_vars[m.group('in')]
542 if member == 'split("")':
543 return list(val)
544 if member == 'join("")':
545 return u''.join(val)
546 if member == 'length':
547 return len(val)
548 if member == 'reverse()':
549 return val[::-1]
550 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
551 if slice_m:
552 idx = interpret_expression(
553 slice_m.group('idx'), local_vars, allow_recursion-1)
554 return val[idx:]
555
556 m = re.match(
557 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
558 if m:
559 val = local_vars[m.group('in')]
560 idx = interpret_expression(m.group('idx'), local_vars,
561 allow_recursion-1)
562 return val[idx]
563
564 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
565 if m:
566 a = interpret_expression(m.group('a'),
567 local_vars, allow_recursion)
568 b = interpret_expression(m.group('b'),
569 local_vars, allow_recursion)
570 return a % b
571
572 m = re.match(
573 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
574 if m:
575 fname = m.group('func')
576 if fname not in functions:
577 functions[fname] = extract_function(fname)
578 argvals = [int(v) if v.isdigit() else local_vars[v]
579 for v in m.group('args').split(',')]
580 return functions[fname](argvals)
581 raise ExtractorError(u'Unsupported JS expression %r' % expr)
582
583 def extract_function(funcname):
584 func_m = re.search(
585 r'function ' + re.escape(funcname) +
586 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
587 jscode)
588 argnames = func_m.group('args').split(',')
589
590 def resf(args):
591 local_vars = dict(zip(argnames, args))
592 for stmt in func_m.group('code').split(';'):
593 res = interpret_statement(stmt, local_vars)
594 return res
595 return resf
596
597 initial_function = extract_function(funcname)
598 return lambda s: initial_function([s])
599
600 def _parse_sig_swf(self, file_contents):
601 if file_contents[1:3] != b'WS':
602 raise ExtractorError(
603 u'Not an SWF file; header is %r' % file_contents[:3])
604 if file_contents[:1] == b'C':
605 content = zlib.decompress(file_contents[8:])
606 else:
607 raise NotImplementedError(u'Unsupported compression format %r' %
608 file_contents[:1])
609
610 def extract_tags(content):
611 pos = 0
612 while pos < len(content):
613 header16 = struct.unpack('<H', content[pos:pos+2])[0]
614 pos += 2
615 tag_code = header16 >> 6
616 tag_len = header16 & 0x3f
617 if tag_len == 0x3f:
618 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
619 pos += 4
620 assert pos+tag_len <= len(content)
621 yield (tag_code, content[pos:pos+tag_len])
622 pos += tag_len
623
624 code_tag = next(tag
625 for tag_code, tag in extract_tags(content)
626 if tag_code == 82)
627 p = code_tag.index(b'\0', 4) + 1
ba552f54 628 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
629
630 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
631 def read_int(reader=None):
632 if reader is None:
633 reader = code_reader
e0df6211
PH
634 res = 0
635 shift = 0
636 for _ in range(5):
ba552f54
PH
637 buf = reader.read(1)
638 assert len(buf) == 1
639 b = struct.unpack('<B', buf)[0]
e0df6211
PH
640 res = res | ((b & 0x7f) << shift)
641 if b & 0x80 == 0:
642 break
643 shift += 7
ba552f54
PH
644 return res
645
646 def u30(reader=None):
647 res = read_int(reader)
648 assert res & 0xf0000000 == 0
e0df6211
PH
649 return res
650 u32 = read_int
651
ba552f54
PH
652 def s32(reader=None):
653 v = read_int(reader)
e0df6211
PH
654 if v & 0x80000000 != 0:
655 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
656 return v
657
0ca96d48 658 def read_string(reader=None):
ba552f54
PH
659 if reader is None:
660 reader = code_reader
661 slen = u30(reader)
662 resb = reader.read(slen)
663 assert len(resb) == slen
664 return resb.decode('utf-8')
665
666 def read_bytes(count, reader=None):
667 if reader is None:
668 reader = code_reader
669 resb = reader.read(count)
670 assert len(resb) == count
671 return resb
672
673 def read_byte(reader=None):
674 resb = read_bytes(1, reader=reader)
675 res = struct.unpack('<B', resb)[0]
676 return res
e0df6211
PH
677
678 # minor_version + major_version
0ca96d48 679 read_bytes(2 + 2)
e0df6211
PH
680
681 # Constant pool
ba552f54 682 int_count = u30()
e0df6211 683 for _c in range(1, int_count):
0ca96d48 684 s32()
ba552f54 685 uint_count = u30()
e0df6211 686 for _c in range(1, uint_count):
0ca96d48 687 u32()
ba552f54 688 double_count = u30()
0ca96d48 689 read_bytes((double_count-1) * 8)
ba552f54 690 string_count = u30()
e0df6211
PH
691 constant_strings = [u'']
692 for _c in range(1, string_count):
0ca96d48 693 s = read_string()
e0df6211 694 constant_strings.append(s)
ba552f54 695 namespace_count = u30()
e0df6211 696 for _c in range(1, namespace_count):
0ca96d48
PH
697 read_bytes(1) # kind
698 u30() # name
ba552f54 699 ns_set_count = u30()
e0df6211 700 for _c in range(1, ns_set_count):
ba552f54 701 count = u30()
e0df6211 702 for _c2 in range(count):
0ca96d48 703 u30()
ba552f54 704 multiname_count = u30()
e0df6211
PH
705 MULTINAME_SIZES = {
706 0x07: 2, # QName
707 0x0d: 2, # QNameA
708 0x0f: 1, # RTQName
709 0x10: 1, # RTQNameA
710 0x11: 0, # RTQNameL
711 0x12: 0, # RTQNameLA
712 0x09: 2, # Multiname
713 0x0e: 2, # MultinameA
714 0x1b: 1, # MultinameL
715 0x1c: 1, # MultinameLA
716 }
717 multinames = [u'']
718 for _c in range(1, multiname_count):
ba552f54 719 kind = u30()
e0df6211
PH
720 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
721 if kind == 0x07:
0ca96d48 722 u30() # namespace_idx
ba552f54 723 name_idx = u30()
e0df6211
PH
724 multinames.append(constant_strings[name_idx])
725 else:
726 multinames.append('[MULTINAME kind: %d]' % kind)
727 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 728 u30()
e0df6211
PH
729
730 # Methods
ba552f54 731 method_count = u30()
e0df6211
PH
732 MethodInfo = collections.namedtuple(
733 'MethodInfo',
734 ['NEED_ARGUMENTS', 'NEED_REST'])
735 method_infos = []
736 for method_id in range(method_count):
ba552f54 737 param_count = u30()
0ca96d48 738 u30() # return type
e0df6211 739 for _ in range(param_count):
0ca96d48
PH
740 u30() # param type
741 u30() # name index (always 0 for youtube)
ba552f54 742 flags = read_byte()
e0df6211
PH
743 if flags & 0x08 != 0:
744 # Options present
ba552f54 745 option_count = u30()
e0df6211 746 for c in range(option_count):
0ca96d48
PH
747 u30() # val
748 read_bytes(1) # kind
e0df6211
PH
749 if flags & 0x80 != 0:
750 # Param names present
751 for _ in range(param_count):
0ca96d48 752 u30() # param name
e0df6211
PH
753 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
754 method_infos.append(mi)
755
756 # Metadata
ba552f54 757 metadata_count = u30()
e0df6211 758 for _c in range(metadata_count):
0ca96d48 759 u30() # name
ba552f54 760 item_count = u30()
e0df6211 761 for _c2 in range(item_count):
0ca96d48
PH
762 u30() # key
763 u30() # value
ba552f54
PH
764
765 def parse_traits_info():
766 trait_name_idx = u30()
767 kind_full = read_byte()
e0df6211
PH
768 kind = kind_full & 0x0f
769 attrs = kind_full >> 4
770 methods = {}
771 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
772 u30() # Slot id
773 u30() # type_name_idx
ba552f54 774 vindex = u30()
e0df6211 775 if vindex != 0:
0ca96d48 776 read_byte() # vkind
e0df6211 777 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 778 u30() # disp_id
ba552f54 779 method_idx = u30()
e0df6211
PH
780 methods[multinames[trait_name_idx]] = method_idx
781 elif kind == 0x04: # Class
0ca96d48
PH
782 u30() # slot_id
783 u30() # classi
e0df6211 784 elif kind == 0x05: # Function
0ca96d48 785 u30() # slot_id
ba552f54 786 function_idx = u30()
e0df6211
PH
787 methods[function_idx] = multinames[trait_name_idx]
788 else:
789 raise ExtractorError(u'Unsupported trait kind %d' % kind)
790
791 if attrs & 0x4 != 0: # Metadata present
ba552f54 792 metadata_count = u30()
e0df6211 793 for _c3 in range(metadata_count):
0ca96d48 794 u30() # metadata index
e0df6211 795
ba552f54 796 return methods
e0df6211
PH
797
798 # Classes
799 TARGET_CLASSNAME = u'SignatureDecipher'
800 searched_idx = multinames.index(TARGET_CLASSNAME)
801 searched_class_id = None
ba552f54 802 class_count = u30()
e0df6211 803 for class_id in range(class_count):
ba552f54 804 name_idx = u30()
e0df6211
PH
805 if name_idx == searched_idx:
806 # We found the class we're looking for!
807 searched_class_id = class_id
0ca96d48 808 u30() # super_name idx
ba552f54 809 flags = read_byte()
e0df6211 810 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 811 u30() # protected_ns_idx
ba552f54 812 intrf_count = u30()
e0df6211 813 for _c2 in range(intrf_count):
0ca96d48
PH
814 u30()
815 u30() # iinit
ba552f54 816 trait_count = u30()
e0df6211 817 for _c2 in range(trait_count):
0ca96d48 818 parse_traits_info()
e0df6211
PH
819
820 if searched_class_id is None:
821 raise ExtractorError(u'Target class %r not found' %
822 TARGET_CLASSNAME)
823
824 method_names = {}
825 method_idxs = {}
826 for class_id in range(class_count):
0ca96d48 827 u30() # cinit
ba552f54 828 trait_count = u30()
e0df6211 829 for _c2 in range(trait_count):
ba552f54 830 trait_methods = parse_traits_info()
e0df6211
PH
831 if class_id == searched_class_id:
832 method_names.update(trait_methods.items())
833 method_idxs.update(dict(
834 (idx, name)
835 for name, idx in trait_methods.items()))
836
837 # Scripts
ba552f54 838 script_count = u30()
e0df6211 839 for _c in range(script_count):
0ca96d48 840 u30() # init
ba552f54 841 trait_count = u30()
e0df6211 842 for _c2 in range(trait_count):
0ca96d48 843 parse_traits_info()
e0df6211
PH
844
845 # Method bodies
ba552f54 846 method_body_count = u30()
e0df6211
PH
847 Method = collections.namedtuple('Method', ['code', 'local_count'])
848 methods = {}
849 for _c in range(method_body_count):
ba552f54 850 method_idx = u30()
0ca96d48 851 u30() # max_stack
ba552f54 852 local_count = u30()
0ca96d48
PH
853 u30() # init_scope_depth
854 u30() # max_scope_depth
ba552f54
PH
855 code_length = u30()
856 code = read_bytes(code_length)
e0df6211 857 if method_idx in method_idxs:
ba552f54 858 m = Method(code, local_count)
e0df6211 859 methods[method_idxs[method_idx]] = m
ba552f54 860 exception_count = u30()
e0df6211 861 for _c2 in range(exception_count):
0ca96d48
PH
862 u30() # from
863 u30() # to
864 u30() # target
865 u30() # exc_type
866 u30() # var_name
ba552f54 867 trait_count = u30()
e0df6211 868 for _c2 in range(trait_count):
0ca96d48 869 parse_traits_info()
e0df6211 870
ba552f54 871 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
872 assert len(methods) == len(method_idxs)
873
874 method_pyfunctions = {}
875
876 def extract_function(func_name):
877 if func_name in method_pyfunctions:
878 return method_pyfunctions[func_name]
879 if func_name not in methods:
880 raise ExtractorError(u'Cannot find function %r' % func_name)
881 m = methods[func_name]
882
883 def resfunc(args):
e0df6211
PH
884 registers = ['(this)'] + list(args) + [None] * m.local_count
885 stack = []
886 coder = io.BytesIO(m.code)
887 while True:
888 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 889 if opcode == 36: # pushbyte
e0df6211
PH
890 v = struct.unpack('!B', coder.read(1))[0]
891 stack.append(v)
892 elif opcode == 44: # pushstring
893 idx = u30(coder)
894 stack.append(constant_strings[idx])
895 elif opcode == 48: # pushscope
896 # We don't implement the scope register, so we'll just
897 # ignore the popped value
898 stack.pop()
899 elif opcode == 70: # callproperty
900 index = u30(coder)
901 mname = multinames[index]
902 arg_count = u30(coder)
903 args = list(reversed(
904 [stack.pop() for _ in range(arg_count)]))
905 obj = stack.pop()
906 if mname == u'split':
907 assert len(args) == 1
908 assert isinstance(args[0], compat_str)
909 assert isinstance(obj, compat_str)
910 if args[0] == u'':
911 res = list(obj)
912 else:
913 res = obj.split(args[0])
914 stack.append(res)
a7177865
PH
915 elif mname == u'slice':
916 assert len(args) == 1
917 assert isinstance(args[0], int)
918 assert isinstance(obj, list)
919 res = obj[args[0]:]
920 stack.append(res)
921 elif mname == u'join':
922 assert len(args) == 1
923 assert isinstance(args[0], compat_str)
924 assert isinstance(obj, list)
925 res = args[0].join(obj)
926 stack.append(res)
e0df6211
PH
927 elif mname in method_pyfunctions:
928 stack.append(method_pyfunctions[mname](args))
929 else:
930 raise NotImplementedError(
931 u'Unsupported property %r on %r'
932 % (mname, obj))
a7177865
PH
933 elif opcode == 72: # returnvalue
934 res = stack.pop()
935 return res
936 elif opcode == 79: # callpropvoid
937 index = u30(coder)
938 mname = multinames[index]
939 arg_count = u30(coder)
940 args = list(reversed(
941 [stack.pop() for _ in range(arg_count)]))
942 obj = stack.pop()
943 if mname == u'reverse':
944 assert isinstance(obj, list)
945 obj.reverse()
946 else:
947 raise NotImplementedError(
948 u'Unsupported (void) property %r on %r'
949 % (mname, obj))
e0df6211
PH
950 elif opcode == 93: # findpropstrict
951 index = u30(coder)
952 mname = multinames[index]
953 res = extract_function(mname)
954 stack.append(res)
955 elif opcode == 97: # setproperty
956 index = u30(coder)
957 value = stack.pop()
958 idx = stack.pop()
959 obj = stack.pop()
960 assert isinstance(obj, list)
961 assert isinstance(idx, int)
962 obj[idx] = value
963 elif opcode == 98: # getlocal
964 index = u30(coder)
965 stack.append(registers[index])
966 elif opcode == 99: # setlocal
967 index = u30(coder)
968 value = stack.pop()
969 registers[index] = value
970 elif opcode == 102: # getproperty
971 index = u30(coder)
972 pname = multinames[index]
973 if pname == u'length':
974 obj = stack.pop()
975 assert isinstance(obj, list)
976 stack.append(len(obj))
977 else: # Assume attribute access
978 idx = stack.pop()
979 assert isinstance(idx, int)
980 obj = stack.pop()
981 assert isinstance(obj, list)
982 stack.append(obj[idx])
983 elif opcode == 128: # coerce
0ca96d48 984 u30(coder)
e0df6211
PH
985 elif opcode == 133: # coerce_s
986 assert isinstance(stack[-1], (type(None), compat_str))
987 elif opcode == 164: # modulo
988 value2 = stack.pop()
989 value1 = stack.pop()
990 res = value1 % value2
991 stack.append(res)
a7177865
PH
992 elif opcode == 208: # getlocal_0
993 stack.append(registers[0])
994 elif opcode == 209: # getlocal_1
995 stack.append(registers[1])
996 elif opcode == 210: # getlocal_2
997 stack.append(registers[2])
998 elif opcode == 211: # getlocal_3
999 stack.append(registers[3])
e0df6211
PH
1000 elif opcode == 214: # setlocal_2
1001 registers[2] = stack.pop()
1002 elif opcode == 215: # setlocal_3
1003 registers[3] = stack.pop()
1004 else:
1005 raise NotImplementedError(
1006 u'Unsupported opcode %d' % opcode)
1007
1008 method_pyfunctions[func_name] = resfunc
1009 return resfunc
1010
1011 initial_function = extract_function(u'decipher')
1012 return lambda s: initial_function([s])
1013
83799698 1014 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 1015 """Turn the encrypted s field into a working signature"""
6b37f0be 1016
83799698 1017 if player_url is not None:
9f9be844
PH
1018 if player_url.startswith(u'//'):
1019 player_url = u'https:' + player_url
e0df6211 1020 try:
7f8ae73a
PH
1021 player_id = (player_url, len(s))
1022 if player_id not in self._player_cache:
83799698 1023 func = self._extract_signature_function(
c4417ddb 1024 video_id, player_url, len(s)
e0df6211 1025 )
7f8ae73a
PH
1026 self._player_cache[player_id] = func
1027 func = self._player_cache[player_id]
edf3e38e
PH
1028 if self._downloader.params.get('youtube_print_sig_code'):
1029 self._print_sig_code(func, len(s))
1030 return func(s)
0ca96d48 1031 except Exception:
e0df6211 1032 tb = traceback.format_exc()
83799698
PH
1033 self._downloader.report_warning(
1034 u'Automatic signature extraction failed: ' + tb)
e0df6211 1035
d2d8f895
PH
1036 self._downloader.report_warning(
1037 u'Warning: Falling back to static signature algorithm')
920de7a2 1038
2f2ffea9
PH
1039 return self._static_decrypt_signature(
1040 s, video_id, player_url, age_gate)
e0df6211 1041
2f2ffea9 1042 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1043 if age_gate:
1044 # The videos with age protection use another player, so the
1045 # algorithms can be different.
1046 if len(s) == 86:
1047 return s[2:63] + s[82] + s[64:82] + s[63]
1048
bc4b9008 1049 if len(s) == 93:
1050 return s[86:29:-1] + s[88] + s[28:5:-1]
1051 elif len(s) == 92:
444b1165 1052 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
1053 elif len(s) == 91:
1054 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1055 elif len(s) == 90:
1056 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1057 elif len(s) == 89:
1058 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1059 elif len(s) == 88:
3e223834 1060 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1061 elif len(s) == 87:
3a725669 1062 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1063 elif len(s) == 86:
f2c327fd 1064 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1065 elif len(s) == 85:
6ae8ee3f 1066 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1067 elif len(s) == 84:
6f56389b 1068 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1069 elif len(s) == 83:
920de7a2 1070 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1071 elif len(s) == 82:
c21315f2 1072 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1073 elif len(s) == 81:
aedd6bb9 1074 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1075 elif len(s) == 80:
1076 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1077 elif len(s) == 79:
1078 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1079
1080 else:
1081 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1082
1f343eaa 1083 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1084 try:
7fad1c63
JMF
1085 sub_list = self._download_webpage(
1086 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1087 video_id, note=False)
1088 except ExtractorError as err:
de7f3446
JMF
1089 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1090 return {}
1091 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1092
1093 sub_lang_list = {}
1094 for l in lang_list:
1095 lang = l[1]
1096 params = compat_urllib_parse.urlencode({
1097 'lang': lang,
1098 'v': video_id,
ca715127 1099 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
8eddf3e9 1100 'name': l[0].encode('utf-8'),
de7f3446
JMF
1101 })
1102 url = u'http://www.youtube.com/api/timedtext?' + params
1103 sub_lang_list[lang] = url
1104 if not sub_lang_list:
1105 self._downloader.report_warning(u'video doesn\'t have subtitles')
1106 return {}
1107 return sub_lang_list
1108
055e6f36 1109 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1110 """We need the webpage for getting the captions url, pass it as an
1111 argument to speed up the process."""
ca715127 1112 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1113 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1114 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1115 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1116 if mobj is None:
1117 self._downloader.report_warning(err_msg)
1118 return {}
1119 player_config = json.loads(mobj.group(1))
1120 try:
1121 args = player_config[u'args']
1122 caption_url = args[u'ttsurl']
1123 timestamp = args[u'timestamp']
055e6f36
JMF
1124 # We get the available subtitles
1125 list_params = compat_urllib_parse.urlencode({
1126 'type': 'list',
1127 'tlangs': 1,
1128 'asrs': 1,
de7f3446 1129 })
055e6f36 1130 list_url = caption_url + '&' + list_params
e26f8712 1131 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1132 original_lang_node = caption_list.find('track')
f6a54188 1133 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1134 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1135 return {}
1136 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1137
1138 sub_lang_list = {}
1139 for lang_node in caption_list.findall('target'):
1140 sub_lang = lang_node.attrib['lang_code']
1141 params = compat_urllib_parse.urlencode({
1142 'lang': original_lang,
1143 'tlang': sub_lang,
1144 'fmt': sub_format,
1145 'ts': timestamp,
1146 'kind': 'asr',
1147 })
1148 sub_lang_list[sub_lang] = caption_url + '&' + params
1149 return sub_lang_list
de7f3446
JMF
1150 # An extractor error can be raise by the download process if there are
1151 # no automatic captions but there are subtitles
1152 except (KeyError, ExtractorError):
1153 self._downloader.report_warning(err_msg)
1154 return {}
1155
c5e8d7af
PH
1156 def _print_formats(self, formats):
1157 print('Available formats:')
1158 for x in formats:
03cc7c20
JMF
1159 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1160 self._video_dimensions.get(x, '???'),
836a086c 1161 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1162
1163 def _extract_id(self, url):
1164 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1165 if mobj is None:
1166 raise ExtractorError(u'Invalid URL: %s' % url)
1167 video_id = mobj.group(2)
1168 return video_id
1169
1d043b93
JMF
1170 def _get_video_url_list(self, url_map):
1171 """
1172 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1173 with the requested formats.
1174 """
1175 req_format = self._downloader.params.get('format', None)
1176 format_limit = self._downloader.params.get('format_limit', None)
1177 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1178 if format_limit is not None and format_limit in available_formats:
1179 format_list = available_formats[available_formats.index(format_limit):]
1180 else:
1181 format_list = available_formats
1182 existing_formats = [x for x in format_list if x in url_map]
1183 if len(existing_formats) == 0:
1184 raise ExtractorError(u'no known formats available for video')
1185 if self._downloader.params.get('listformats', None):
1186 self._print_formats(existing_formats)
1187 return
1188 if req_format is None or req_format == 'best':
1189 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1190 elif req_format == 'worst':
1191 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1192 elif req_format in ('-1', 'all'):
1193 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1194 else:
1195 # Specific formats. We pick the first in a slash-delimeted sequence.
bdc6b3fc
AZ
1196 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1197 # available in the specified format. For example,
1198 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1199 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1200 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1d043b93
JMF
1201 req_formats = req_format.split('/')
1202 video_url_list = None
1203 for rf in req_formats:
1204 if rf in url_map:
1205 video_url_list = [(rf, url_map[rf])]
1206 break
bdc6b3fc
AZ
1207 if rf in self._video_formats_map:
1208 for srf in self._video_formats_map[rf]:
1209 if srf in url_map:
1210 video_url_list = [(srf, url_map[srf])]
1211 break
1212 else:
1213 continue
1214 break
1d043b93
JMF
1215 if video_url_list is None:
1216 raise ExtractorError(u'requested format not available')
1217 return video_url_list
1218
1219 def _extract_from_m3u8(self, manifest_url, video_id):
1220 url_map = {}
1221 def _get_urls(_manifest):
1222 lines = _manifest.split('\n')
1223 urls = filter(lambda l: l and not l.startswith('#'),
1224 lines)
1225 return urls
1226 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1227 formats_urls = _get_urls(manifest)
1228 for format_url in formats_urls:
890f62e8 1229 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1230 url_map[itag] = format_url
1231 return url_map
1232
1fb07d10
JG
1233 def _extract_annotations(self, video_id):
1234 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1235 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1236
c5e8d7af
PH
1237 def _real_extract(self, url):
1238 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1239 mobj = re.search(self._NEXT_URL_RE, url)
1240 if mobj:
1241 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1242 video_id = self._extract_id(url)
1243
1244 # Get video webpage
c5e8d7af 1245 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 1246 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
1247
1248 # Attempt to extract SWF player URL
e0df6211 1249 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1250 if mobj is not None:
1251 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1252 else:
1253 player_url = None
1254
1255 # Get video info
1256 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1257 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1258 self.report_age_confirmation()
1259 age_gate = True
1260 # We simulate the access to the video from www.youtube.com/v/{video_id}
1261 # this can be viewed without login into Youtube
1262 data = compat_urllib_parse.urlencode({'video_id': video_id,
fccd3771 1263 'el': 'player_embedded',
c108eb73
JMF
1264 'gl': 'US',
1265 'hl': 'en',
1266 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1267 'asv': 3,
1268 'sts':'1588',
1269 })
1270 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1271 video_info_webpage = self._download_webpage(video_info_url, video_id,
1272 note=False,
1273 errnote='unable to download video info webpage')
1274 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1275 else:
1276 age_gate = False
1277 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1278 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1279 % (video_id, el_type))
1280 video_info_webpage = self._download_webpage(video_info_url, video_id,
1281 note=False,
1282 errnote='unable to download video info webpage')
1283 video_info = compat_parse_qs(video_info_webpage)
1284 if 'token' in video_info:
1285 break
c5e8d7af
PH
1286 if 'token' not in video_info:
1287 if 'reason' in video_info:
9a82b238 1288 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1289 else:
1290 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1291
1d699755
PH
1292 if 'view_count' in video_info:
1293 view_count = int(video_info['view_count'][0])
1294 else:
1295 view_count = None
1296
c5e8d7af
PH
1297 # Check for "rental" videos
1298 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1299 raise ExtractorError(u'"rental" videos not supported')
1300
1301 # Start extracting information
1302 self.report_information_extraction(video_id)
1303
1304 # uploader
1305 if 'author' not in video_info:
1306 raise ExtractorError(u'Unable to extract uploader name')
1307 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1308
1309 # uploader_id
1310 video_uploader_id = None
1311 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1312 if mobj is not None:
1313 video_uploader_id = mobj.group(1)
1314 else:
1315 self._downloader.report_warning(u'unable to extract uploader nickname')
1316
1317 # title
a8c6b241
PH
1318 if 'title' in video_info:
1319 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1320 else:
1321 self._downloader.report_warning(u'Unable to extract video title')
1322 video_title = u'_'
c5e8d7af
PH
1323
1324 # thumbnail image
7763b04e
JMF
1325 # We try first to get a high quality image:
1326 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1327 video_webpage, re.DOTALL)
1328 if m_thumb is not None:
1329 video_thumbnail = m_thumb.group(1)
1330 elif 'thumbnail_url' not in video_info:
c5e8d7af 1331 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 1332 video_thumbnail = None
c5e8d7af
PH
1333 else: # don't panic if we can't find it
1334 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1335
1336 # upload date
1337 upload_date = None
1338 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1339 if mobj is not None:
1340 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1341 upload_date = unified_strdate(upload_date)
1342
1343 # description
1344 video_description = get_element_by_id("eow-description", video_webpage)
1345 if video_description:
27dcce19
PH
1346 video_description = re.sub(r'''(?x)
1347 <a\s+
1348 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1349 title="([^"]+)"\s+
1350 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1351 class="yt-uix-redirect-link"\s*>
1352 [^<]+
1353 </a>
1354 ''', r'\1', video_description)
c5e8d7af
PH
1355 video_description = clean_html(video_description)
1356 else:
1357 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1358 if fd_mobj:
1359 video_description = unescapeHTML(fd_mobj.group(1))
1360 else:
1361 video_description = u''
1362
336c3a69 1363 def _extract_count(klass):
46374a56
PH
1364 count = self._search_regex(
1365 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1366 video_webpage, klass, default=None)
336c3a69
JMF
1367 if count is not None:
1368 return int(count.replace(',', ''))
1369 return None
1370 like_count = _extract_count(u'likes-count')
1371 dislike_count = _extract_count(u'dislikes-count')
1372
c5e8d7af 1373 # subtitles
d82134c3 1374 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1375
c5e8d7af 1376 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1377 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1378 return
1379
1380 if 'length_seconds' not in video_info:
1381 self._downloader.report_warning(u'unable to extract video duration')
b466b702 1382 video_duration = None
c5e8d7af 1383 else:
b466b702 1384 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 1385
1fb07d10
JG
1386 # annotations
1387 video_annotations = None
1388 if self._downloader.params.get('writeannotations', False):
1389 video_annotations = self._extract_annotations(video_id)
1390
c5e8d7af 1391 # Decide which formats to download
c5e8d7af
PH
1392
1393 try:
1394 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1395 if not mobj:
1396 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1397 info = json.loads(mobj.group(1))
1398 args = info['args']
7ce7e394
JMF
1399 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1400 # this signatures are encrypted
44d46655 1401 if 'url_encoded_fmt_stream_map' not in args:
f10503db 1402 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
1403 re_signature = re.compile(r'[&,]s=')
1404 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
1405 if m_s is not None:
1406 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1407 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 1408 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 1409 if m_s is not None:
00fe14fc
JMF
1410 if 'adaptive_fmts' in video_info:
1411 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 1412 else:
00fe14fc 1413 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
1414 except ValueError:
1415 pass
1416
1417 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1418 self.report_rtmp_download()
1419 video_url_list = [(None, video_info['conn'][0])]
00fe14fc
JMF
1420 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1421 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1422 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1423 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 1424 url_map = {}
00fe14fc 1425 for url_data_str in encoded_url_map.split(','):
c5e8d7af
PH
1426 url_data = compat_parse_qs(url_data_str)
1427 if 'itag' in url_data and 'url' in url_data:
1428 url = url_data['url'][0]
1429 if 'sig' in url_data:
1430 url += '&signature=' + url_data['sig'][0]
1431 elif 's' in url_data:
e0df6211 1432 encrypted_sig = url_data['s'][0]
769fda3c 1433 if self._downloader.params.get('verbose'):
c108eb73 1434 if age_gate:
bdde940e
PH
1435 if player_url is None:
1436 player_version = 'unknown'
1437 else:
1438 player_version = self._search_regex(
1439 r'-(.+)\.swf$', player_url,
1440 u'flash player', fatal=False)
e0df6211 1441 player_desc = 'flash player %s' % player_version
c108eb73 1442 else:
83799698
PH
1443 player_version = self._search_regex(
1444 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1445 'html5 player', fatal=False)
e0df6211
PH
1446 player_desc = u'html5 player %s' % player_version
1447
1448 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1449 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1450 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1451
83799698 1452 if not age_gate:
e0df6211
PH
1453 jsplayer_url_json = self._search_regex(
1454 r'"assets":.+?"js":\s*("[^"]+")',
1455 video_webpage, u'JS player URL')
83799698 1456 player_url = json.loads(jsplayer_url_json)
e0df6211 1457
83799698
PH
1458 signature = self._decrypt_signature(
1459 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1460 url += '&signature=' + signature
1461 if 'ratebypass' not in url:
1462 url += '&ratebypass=yes'
1463 url_map[url_data['itag'][0]] = url
1d043b93
JMF
1464 video_url_list = self._get_video_url_list(url_map)
1465 if not video_url_list:
c5e8d7af 1466 return
1d043b93
JMF
1467 elif video_info.get('hlsvp'):
1468 manifest_url = video_info['hlsvp'][0]
1469 url_map = self._extract_from_m3u8(manifest_url, video_id)
1470 video_url_list = self._get_video_url_list(url_map)
1471 if not video_url_list:
1472 return
1473
c5e8d7af 1474 else:
9abb3204 1475 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af
PH
1476
1477 results = []
600cc1a4 1478 for itag, video_real_url in video_url_list:
c5e8d7af 1479 # Extension
600cc1a4 1480 video_extension = self._video_extensions.get(itag, 'flv')
c5e8d7af 1481
600cc1a4
JMF
1482 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1483 self._video_dimensions.get(itag, '???'),
1484 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
c5e8d7af
PH
1485
1486 results.append({
1487 'id': video_id,
1488 'url': video_real_url,
1489 'uploader': video_uploader,
1490 'uploader_id': video_uploader_id,
1491 'upload_date': upload_date,
1492 'title': video_title,
1493 'ext': video_extension,
1494 'format': video_format,
600cc1a4 1495 'format_id': itag,
c5e8d7af
PH
1496 'thumbnail': video_thumbnail,
1497 'description': video_description,
1498 'player_url': player_url,
1499 'subtitles': video_subtitles,
8dbe9899 1500 'duration': video_duration,
cfadd183 1501 'age_limit': 18 if age_gate else 0,
9103bbc5
JMF
1502 'annotations': video_annotations,
1503 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1d699755 1504 'view_count': view_count,
336c3a69
JMF
1505 'like_count': like_count,
1506 'dislike_count': dislike_count,
c5e8d7af
PH
1507 })
1508 return results
1509
880e1c52 1510class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1511 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1512 _VALID_URL = r"""(?:
1513 (?:https?://)?
1514 (?:\w+\.)?
1515 youtube\.com/
1516 (?:
1517 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1518 \? (?:.*?&)*? (?:p|a|list)=
1519 | p/
1520 )
715c8e7b 1521 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1522 .*
1523 |
715c8e7b 1524 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1525 )"""
dcbb4580
JMF
1526 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1527 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1528 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1529 IE_NAME = u'youtube:playlist'
1530
1531 @classmethod
1532 def suitable(cls, url):
1533 """Receives a URL and returns True if suitable for this IE."""
1534 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1535
880e1c52
JMF
1536 def _real_initialize(self):
1537 self._login()
1538
652cdaa2
JMF
1539 def _ids_to_results(self, ids):
1540 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1541 for vid_id in ids]
1542
1543 def _extract_mix(self, playlist_id):
1544 # The mixes are generated from a a single video
1545 # the id of the playlist is just 'RD' + video_id
7d4afc55 1546 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1547 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1548 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1549 get_element_by_attribute('class', 'title ', webpage))
1550 title = clean_html(title_span)
652cdaa2
JMF
1551 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1552 ids = orderedSet(re.findall(video_re, webpage))
1553 url_results = self._ids_to_results(ids)
1554
1555 return self.playlist_result(url_results, playlist_id, title)
1556
c5e8d7af
PH
1557 def _real_extract(self, url):
1558 # Extract playlist id
1559 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1560 if mobj is None:
1561 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1562 playlist_id = mobj.group(1) or mobj.group(2)
1563
1564 # Check if it's a video-specific URL
7c61bd36 1565 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1566 if 'v' in query_dict:
1567 video_id = query_dict['v'][0]
1568 if self._downloader.params.get('noplaylist'):
1569 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1570 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1571 else:
1572 self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1573
7d4afc55 1574 if playlist_id.startswith('RD'):
652cdaa2
JMF
1575 # Mixes require a custom extraction process
1576 return self._extract_mix(playlist_id)
0a688bc0
JMF
1577 if playlist_id.startswith('TL'):
1578 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1579 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1580
dcbb4580
JMF
1581 # Extract the video ids from the playlist pages
1582 ids = []
c5e8d7af 1583
755eb032 1584 for page_num in itertools.count(1):
dcbb4580 1585 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1586 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1587 matches = re.finditer(self._VIDEO_RE, page)
1588 # We remove the duplicates and the link with index 0
1589 # (it's not the first video of the playlist)
1590 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1591 ids.extend(new_ids)
c5e8d7af 1592
dcbb4580 1593 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1594 break
1595
dcbb4580 1596 playlist_title = self._og_search_title(page)
c5e8d7af 1597
652cdaa2 1598 url_results = self._ids_to_results(ids)
dcbb4580 1599 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1600
1601
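# Illustrative sketch (not part of the extractor above): how a mix playlist
# URL is rebuilt from a seed video id, following the 'RD' + video_id rule
# used in YoutubePlaylistIE._extract_mix. The helper name is hypothetical and
# exists only for demonstration; nothing below is called by the extractors.
def _example_mix_url(seed_video_id):
    # a mix playlist id is simply 'RD' followed by the seed video id,
    # and the watch URL carries both the video id and the list id
    mix_id = 'RD' + seed_video_id
    return 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, mix_id)

# _example_mix_url('BaW_jenozKc') ->
#     'https://youtube.com/watch?v=BaW_jenozKc&list=RDBaW_jenozKc'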
0a688bc0
JMF
1602class YoutubeTopListIE(YoutubePlaylistIE):
1603 IE_NAME = u'youtube:toplist'
1604 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1605 u' (Example: "yttoplist:music:Top Tracks")')
1606 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1607
1608 def _real_extract(self, url):
1609 mobj = re.match(self._VALID_URL, url)
1610 channel = mobj.group('chann')
1611 title = mobj.group('title')
1612 query = compat_urllib_parse.urlencode({'title': title})
1613 playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
1614 channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
1615 link = self._html_search_regex(playlist_re, channel_page, u'list')
1616 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
1617
1618 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1619 ids = []
1620 # Sometimes the webpage doesn't contain the videos;
1621 # retry until we get them (see the sketch after this class)
1622 for i in itertools.count(0):
1623 msg = u'Downloading Youtube top list'
1624 if i > 0:
1625 msg += ', retry #%d' % i
1626 webpage = self._download_webpage(url, title, msg)
1627 ids = orderedSet(re.findall(video_re, webpage))
1628 if ids:
1629 break
1630 url_results = self._ids_to_results(ids)
1631 return self.playlist_result(url_results, playlist_title=title)
1632
1633
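# Illustrative sketch (not part of YoutubeTopListIE): the retry pattern used
# above, fetching a page repeatedly until it actually contains video ids.
# fetch_page is a hypothetical stand-in for self._download_webpage, and
# deduplication (orderedSet in the real code) is omitted for brevity.
# itertools and re are already imported at the top of this module.
def _example_retry_until_ids(fetch_page, url, video_re):
    for i in itertools.count(0):
        note = u'Downloading page' + (u', retry #%d' % i if i > 0 else u'')
        webpage = fetch_page(url, note)
        ids = re.findall(video_re, webpage)
        if ids:
            return ids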
c5e8d7af 1634class YoutubeChannelIE(InfoExtractor):
0f818663 1635 IE_DESC = u'YouTube.com channels'
c5e8d7af 1636 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1637 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1638 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1639 IE_NAME = u'youtube:channel'
1640
1641 def extract_videos_from_page(self, page):
1642 ids_in_page = []
1643 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1644 if mobj.group(1) not in ids_in_page:
1645 ids_in_page.append(mobj.group(1))
1646 return ids_in_page
1647
1648 def _real_extract(self, url):
1649 # Extract channel id
1650 mobj = re.match(self._VALID_URL, url)
1651 if mobj is None:
1652 raise ExtractorError(u'Invalid URL: %s' % url)
1653
1654 # Download channel page
1655 channel_id = mobj.group(1)
1656 video_ids = []
b9643eed
JMF
1657 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1658 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1659 autogenerated = re.search(r'''(?x)
1660 class="[^"]*?(?:
1661 channel-header-autogenerated-label|
1662 yt-channel-title-autogenerated
1663 )[^"]*"''', channel_page) is not None
c5e8d7af 1664
b9643eed
JMF
1665 if autogenerated:
1666 # The videos are contained in a single page;
1667 # the ajax pages can't be used since they are empty
1668 video_ids = self.extract_videos_from_page(channel_page)
1669 else:
1670 # Download all channel pages using the json-based channel_ajax query
1671 for pagenum in itertools.count(1):
1672 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1673 page = self._download_webpage(url, channel_id,
1674 u'Downloading page #%s' % pagenum)
1675
1676 page = json.loads(page)
1677
1678 ids_in_page = self.extract_videos_from_page(page['content_html'])
1679 video_ids.extend(ids_in_page)
1680
1681 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1682 break
c5e8d7af
PH
1683
1684 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1685
7012b23c
PH
1686 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1687 for video_id in video_ids]
1688 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1689
1690
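# Illustrative sketch (not part of YoutubeChannelIE): the ajax paging loop
# above in miniature. fetch_json is a hypothetical stand-in that downloads
# one page of the c4_browse_ajax endpoint and json.loads() it; as in the
# code above, the response is assumed to carry 'content_html' and
# 'load_more_widget_html'. Duplicate ids are not filtered here for brevity;
# itertools and re are already imported at the top of this module.
def _example_channel_video_ids(fetch_json, more_pages_url, channel_id,
                               more_pages_indicator='yt-uix-load-more'):
    video_ids = []
    for pagenum in itertools.count(1):
        page = fetch_json(more_pages_url % (pagenum, channel_id))
        video_ids.extend(
            re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page['content_html']))
        # the "load more" widget disappears once the last page is reached
        if more_pages_indicator not in page['load_more_widget_html']:
            break
    return video_ids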
1691class YoutubeUserIE(InfoExtractor):
0f818663 1692 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1693 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1694 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1695 _GDATA_PAGE_SIZE = 50
fd9cf738 1696 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1697 IE_NAME = u'youtube:user'
1698
e3ea4790 1699 @classmethod
f4b05232 1700 def suitable(cls, url):
e3ea4790
JMF
1701 # Don't return True if the url can be extracted with another youtube
1702 # extractor; this regex is too permissive and would match it.
1703 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1704 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1705 else: return super(YoutubeUserIE, cls).suitable(url)
1706
c5e8d7af
PH
1707 def _real_extract(self, url):
1708 # Extract username
1709 mobj = re.match(self._VALID_URL, url)
1710 if mobj is None:
1711 raise ExtractorError(u'Invalid URL: %s' % url)
1712
1713 username = mobj.group(1)
1714
1715 # Download video ids using the YouTube Data API. Result size per
1716 # query is limited (currently to 50 videos), so we have to query
1717 # page by page until a page comes back with no video ids, which
1718 # means we have got all of them (see the sketch after this class).
1719
1720 video_ids = []
c5e8d7af 1721
755eb032 1722 for pagenum in itertools.count(0):
c5e8d7af
PH
1723 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1724
1725 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1726 page = self._download_webpage(gdata_url, username,
1727 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE - 1))
1728
fd9cf738
JMF
1729 try:
1730 response = json.loads(page)
1731 except ValueError as err:
1732 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1733 if 'entry' not in response['feed']:
1734 # The total number of videos is an exact multiple of self._GDATA_PAGE_SIZE
1735 break
fd9cf738 1736
c5e8d7af
PH
1737 # Extract video identifiers
1738 ids_in_page = []
fd9cf738
JMF
1739 for entry in response['feed']['entry']:
1740 ids_in_page.append(entry['id']['$t'].split('/')[-1])
c5e8d7af
PH
1741 video_ids.extend(ids_in_page)
1742
1743 # A little optimization - if the current page is not
1744 # "full", i.e. does not contain _GDATA_PAGE_SIZE video ids, then
1745 # we can assume that this page is the last one - there
1746 # are no more ids on further pages, so there is no need to
1747 # query again.
1748
1749 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1750 break
1751
7012b23c
PH
1752 url_results = [
1753 self.url_result(video_id, 'Youtube', video_id=video_id)
1754 for video_id in video_ids]
1755 return self.playlist_result(url_results, playlist_title=username)
1756
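# Illustrative sketch (not part of YoutubeUserIE): the 1-based GData
# start-index arithmetic used in _real_extract above. Pure arithmetic,
# nothing is downloaded here; the function name is hypothetical.
def _example_gdata_start_indices(page_size=50, pages=3):
    # page 0 -> 1, page 1 -> 51, page 2 -> 101, ... for a page size of 50
    return [pagenum * page_size + 1 for pagenum in range(pages)]

# _example_gdata_start_indices() -> [1, 51, 101]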
b05654f0
PH
1757
1758class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1759 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1760 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1761 _MAX_RESULTS = 1000
1762 IE_NAME = u'youtube:search'
1763 _SEARCH_KEY = 'ytsearch'
1764
b05654f0
PH
1765 def _get_n_results(self, query, n):
1766 """Get a specified number of results for a query"""
1767
1768 video_ids = []
1769 pagenum = 0
1770 limit = n
1771
1772 while (50 * pagenum) < limit:
b05654f0 1773 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
7cc3570e
PH
1774 data_json = self._download_webpage(
1775 result_url, video_id=u'query "%s"' % query,
1776 note=u'Downloading page %s' % (pagenum + 1),
1777 errnote=u'Unable to download API page')
1778 data = json.loads(data_json)
1779 api_response = data['data']
1780
1781 if 'items' not in api_response:
b05654f0
PH
1782 raise ExtractorError(u'[youtube] No video results')
1783
1784 new_ids = list(video['id'] for video in api_response['items'])
1785 video_ids += new_ids
1786
1787 limit = min(n, api_response['totalItems'])
1788 pagenum += 1
1789
1790 if len(video_ids) > n:
1791 video_ids = video_ids[:n]
7012b23c
PH
1792 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1793 for video_id in video_ids]
b05654f0 1794 return self.playlist_result(videos, query)
75dff0ee 1795
a3dd9248 1796class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 1797 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248
CM
1798 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1799 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1800 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1801
1802class YoutubeShowIE(InfoExtractor):
0f818663 1803 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1804 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1805 IE_NAME = u'youtube:show'
1806
1807 def _real_extract(self, url):
1808 mobj = re.match(self._VALID_URL, url)
1809 show_name = mobj.group(1)
1810 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1811 # There's one playlist for each season of the show
1812 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1813 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1814 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1815
1816
b2e8bc1b 1817class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1818 """
1819 Base class for extractors that fetch info from
1820 http://www.youtube.com/feed_ajax
1821 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1822 """
b2e8bc1b 1823 _LOGIN_REQUIRED = True
43ba5456
JMF
1824 # use action_load_personal_feed instead of action_load_system_feed
1825 _PERSONAL_FEED = False
04cc9617 1826
d7ae0639
JMF
1827 @property
1828 def _FEED_TEMPLATE(self):
43ba5456
JMF
1829 action = 'action_load_system_feed'
1830 if self._PERSONAL_FEED:
1831 action = 'action_load_personal_feed'
1832 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
d7ae0639
JMF
1833
1834 @property
1835 def IE_NAME(self):
1836 return u'youtube:%s' % self._FEED_NAME
04cc9617 1837
81f0259b 1838 def _real_initialize(self):
b2e8bc1b 1839 self._login()
81f0259b 1840
04cc9617
JMF
1841 def _real_extract(self, url):
1842 feed_entries = []
0e44d838
JMF
1843 paging = 0
1844 for i in itertools.count(1):
d7ae0639
JMF
1845 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1846 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1847 u'Downloading page %s' % i)
1848 info = json.loads(info)
1849 feed_html = info['feed_html']
43ba5456 1850 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617 1851 ids = orderedSet(m.group(1) for m in m_ids)
7012b23c
PH
1852 feed_entries.extend(
1853 self.url_result(video_id, 'Youtube', video_id=video_id)
1854 for video_id in ids)
04cc9617
JMF
1855 if info['paging'] is None:
1856 break
0e44d838 1857 paging = info['paging']
d7ae0639
JMF
1858 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1859
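# Illustrative sketch (not part of the feed extractors below): how the
# _FEED_TEMPLATE property of the base class above assembles its URL from
# _PERSONAL_FEED and _FEED_NAME. The function only mirrors that string
# formatting for demonstration and is never called by the extractors.
def _example_feed_template(feed_name, personal_feed=False):
    action = 'action_load_personal_feed' if personal_feed else 'action_load_system_feed'
    return ('http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s'
            % (action, feed_name))

# _example_feed_template('subscriptions') ->
#     'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
# _example_feed_template('watch_later', personal_feed=True) ->
#     'http://www.youtube.com/feed_ajax?action_load_personal_feed=1&feed_name=watch_later&paging=%s'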
1860class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1861 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1862 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1863 _FEED_NAME = 'subscriptions'
1864 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1865
1866class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1867 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1868 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1869 _FEED_NAME = 'recommended'
1870 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1871
43ba5456
JMF
1872class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1873 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1874 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1875 _FEED_NAME = 'watch_later'
1876 _PLAYLIST_TITLE = u'Youtube Watch Later'
43ba5456 1877 _PERSONAL_FEED = True
c626a3d9 1878
f459d170
JMF
1879class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1880 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1881 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1882 _FEED_NAME = 'history'
1883 _PERSONAL_FEED = True
1884 _PLAYLIST_TITLE = u'Youtube Watch History'
1885
c626a3d9
JMF
1886class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1887 IE_NAME = u'youtube:favorites'
1888 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1889 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1890 _LOGIN_REQUIRED = True
1891
1892 def _real_extract(self, url):
1893 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1894 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1895 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1896
1897
1898class YoutubeTruncatedURLIE(InfoExtractor):
1899 IE_NAME = 'youtube:truncated_url'
1900 IE_DESC = False # Do not list
1901 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1902
1903 def _real_extract(self, url):
1904 raise ExtractorError(
1905 u'Did you forget to quote the URL? Remember that & is a meta '
1906 u'character in most shells, so you want to put the URL in quotes, '
1907 u'like youtube-dl '
1908 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1909 u' (or simply youtube-dl BaW_jenozKc ).',
1910 expected=True)