]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Correct subtitle URL (Fixes #2120)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af
PH
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
c5e8d7af
PH
34)
35
de7f3446 36class YoutubeBaseInfoExtractor(InfoExtractor):
b2e8bc1b
JMF
37 """Provide base functions for Youtube extractors"""
38 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
39 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
40 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
41 _NETRC_MACHINE = 'youtube'
42 # If True it will raise an error if no login info is provided
43 _LOGIN_REQUIRED = False
44
b2e8bc1b 45 def _set_language(self):
7cc3570e
PH
46 return bool(self._download_webpage(
47 self._LANG_URL, None,
48 note=u'Setting language', errnote='unable to set language',
49 fatal=False))
b2e8bc1b
JMF
50
51 def _login(self):
52 (username, password) = self._get_login_info()
53 # No authentication to be performed
54 if username is None:
55 if self._LOGIN_REQUIRED:
56 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
57 return False
58
7cc3570e
PH
59 login_page = self._download_webpage(
60 self._LOGIN_URL, None,
61 note=u'Downloading login page',
62 errnote=u'unable to fetch login page', fatal=False)
63 if login_page is False:
64 return
b2e8bc1b 65
795f28f8
PH
66 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
67 login_page, u'Login GALX parameter')
c5e8d7af 68
b2e8bc1b
JMF
69 # Log in
70 login_form_strs = {
71 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
72 u'Email': username,
73 u'GALX': galx,
74 u'Passwd': password,
75 u'PersistentCookie': u'yes',
76 u'_utf8': u'霱',
77 u'bgresponse': u'js_disabled',
78 u'checkConnection': u'',
79 u'checkedDomains': u'youtube',
80 u'dnConn': u'',
b2e8bc1b
JMF
81 u'pstMsg': u'0',
82 u'rmShown': u'1',
83 u'secTok': u'',
84 u'signIn': u'Sign in',
85 u'timeStmp': u'',
86 u'service': u'youtube',
87 u'uilel': u'3',
88 u'hl': u'en_US',
89 }
90 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
91 # chokes on unicode
92 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
93 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
7cc3570e
PH
94
95 req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
96 login_results = self._download_webpage(
97 req, None,
98 note=u'Logging in', errnote=u'unable to log in', fatal=False)
99 if login_results is False:
100 return False
101 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
102 self._downloader.report_warning(u'unable to log in: bad username or password')
b2e8bc1b
JMF
103 return False
104 return True
105
106 def _confirm_age(self):
107 age_form = {
7cc3570e
PH
108 'next_url': '/',
109 'action_confirm': 'Confirm',
110 }
111 req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
112
113 self._download_webpage(
114 req, None,
115 note=u'Confirming age', errnote=u'Unable to confirm age')
b2e8bc1b
JMF
116 return True
117
118 def _real_initialize(self):
119 if self._downloader is None:
120 return
121 if not self._set_language():
122 return
123 if not self._login():
124 return
125 self._confirm_age()
c5e8d7af 126
8377574c 127
de7f3446 128class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 129 IE_DESC = u'YouTube.com'
cb7dfeea 130 _VALID_URL = r"""(?x)^
c5e8d7af 131 (
83aa5293 132 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 133 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
e69ae5b9
JMF
134 tube\.majestyc\.net/|
135 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
136 (?:.*?\#/)? # handle anchor (#/) redirect urls
137 (?: # the various things that can precede the ID:
138 (?:(?:v|embed|e)/) # v/ or embed/ or e/
139 |(?: # or the v= param in all its forms
d741e55a 140 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
141 (?:\?|\#!?) # the params delimiter ? or # or #!
142 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
143 v=
144 )
f4b05232
JMF
145 ))
146 |youtu\.be/ # just youtu.be/xxxx
147 )
c5e8d7af 148 )? # all until now is optional -> you can pass the naked ID
8963d9c2 149 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
150 (?(1).+)? # if we found the ID, everything can follow
151 $"""
c5e8d7af 152 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
153 _formats = {
154 '5': {'ext': 'flv', 'width': 400, 'height': 240},
155 '6': {'ext': 'flv', 'width': 450, 'height': 270},
156 '13': {'ext': '3gp'},
157 '17': {'ext': '3gp', 'width': 176, 'height': 144},
158 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
159 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
160 '34': {'ext': 'flv', 'width': 640, 'height': 360},
161 '35': {'ext': 'flv', 'width': 854, 'height': 480},
162 '36': {'ext': '3gp', 'width': 320, 'height': 240},
163 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
164 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
165 '43': {'ext': 'webm', 'width': 640, 'height': 360},
166 '44': {'ext': 'webm', 'width': 854, 'height': 480},
167 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
168 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
169
1d043b93 170
86fe61c8 171 # 3d videos
2c62dc26
PH
172 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
173 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
174 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
175 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
176 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
177 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
178 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 179
96fb5605 180 # Apple HTTP Live Streaming
2c62dc26
PH
181 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
182 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
183 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
184 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
185 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
186 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
187 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
188
189 # DASH mp4 video
190 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
191 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
192 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
193 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
194 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
195 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
196 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
8fa8a629 197 '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
836a086c 198
f6f1fc92 199 # Dash mp4 audio
2c62dc26
PH
200 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
201 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
202 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
203
204 # Dash webm
2c62dc26
PH
205 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
206 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
207 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
208 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
209 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
210 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
211 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
212
213 # Dash webm audio
214 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
215 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
c5e8d7af 216 }
836a086c 217
c5e8d7af 218 IE_NAME = u'youtube'
2eb88d95
PH
219 _TESTS = [
220 {
0e853ca4
PH
221 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
222 u"file": u"BaW_jenozKc.mp4",
223 u"info_dict": {
224 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
225 u"uploader": u"Philipp Hagemeister",
226 u"uploader_id": u"phihag",
227 u"upload_date": u"20121002",
27dcce19 228 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 229 }
0e853ca4 230 },
0e853ca4
PH
231 {
232 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
233 u"file": u"UxxajLWwzqY.mp4",
234 u"note": u"Test generic use_cipher_signature video (#897)",
235 u"info_dict": {
236 u"upload_date": u"20120506",
237 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 238 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 239 u"uploader": u"Icona Pop",
0e853ca4 240 u"uploader_id": u"IconaPop"
2eb88d95 241 }
c108eb73
JMF
242 },
243 {
244 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
245 u"file": u"07FYdnEawAQ.mp4",
246 u"note": u"Test VEVO video with age protection (#956)",
247 u"info_dict": {
248 u"upload_date": u"20130703",
249 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
250 u"description": u"md5:64249768eec3bc4276236606ea996373",
251 u"uploader": u"justintimberlakeVEVO",
252 u"uploader_id": u"justintimberlakeVEVO"
253 }
254 },
fccd3771 255 {
83aa5293 256 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
257 u"file": u"yZIXLfi8CZQ.mp4",
258 u"note": u"Embed-only video (#1746)",
259 u"info_dict": {
260 u"upload_date": u"20120608",
261 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
262 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
263 u"uploader": u"SET India",
264 u"uploader_id": u"setindia"
265 }
266 },
2eb88d95
PH
267 ]
268
c5e8d7af
PH
269
270 @classmethod
271 def suitable(cls, url):
272 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 273 if YoutubePlaylistIE.suitable(url): return False
fccd3771 274 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 275
e0df6211
PH
276 def __init__(self, *args, **kwargs):
277 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 278 self._player_cache = {}
e0df6211 279
c5e8d7af
PH
280 def report_video_info_webpage_download(self, video_id):
281 """Report attempt to download video info webpage."""
282 self.to_screen(u'%s: Downloading video info webpage' % video_id)
283
c5e8d7af
PH
284 def report_information_extraction(self, video_id):
285 """Report attempt to extract video information."""
286 self.to_screen(u'%s: Extracting video information' % video_id)
287
288 def report_unavailable_format(self, video_id, format):
289 """Report extracted video URL."""
290 self.to_screen(u'%s: Format %s not available' % (video_id, format))
291
292 def report_rtmp_download(self):
293 """Indicate the download will use the RTMP protocol."""
294 self.to_screen(u'RTMP download detected')
295
c4417ddb
PH
296 def _extract_signature_function(self, video_id, player_url, slen):
297 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 298 player_url)
e0df6211
PH
299 player_type = id_m.group('ext')
300 player_id = id_m.group('id')
301
c4417ddb
PH
302 # Read from filesystem cache
303 func_id = '%s_%s_%d' % (player_type, player_id, slen)
304 assert os.path.basename(func_id) == func_id
c38b1e77 305 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 306
c3c88a26 307 cache_enabled = cache_dir is not None
f8061589 308 if cache_enabled:
c4417ddb
PH
309 cache_fn = os.path.join(os.path.expanduser(cache_dir),
310 u'youtube-sigfuncs',
311 func_id + '.json')
312 try:
edf3e38e 313 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
314 cache_spec = json.load(cachef)
315 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 316 except IOError:
c4417ddb 317 pass # No cache available
83799698 318
e0df6211
PH
319 if player_type == 'js':
320 code = self._download_webpage(
321 player_url, video_id,
83799698 322 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 323 errnote=u'Download of %s failed' % player_url)
83799698 324 res = self._parse_sig_js(code)
c4417ddb 325 elif player_type == 'swf':
e0df6211
PH
326 urlh = self._request_webpage(
327 player_url, video_id,
83799698 328 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
329 errnote=u'Download of %s failed' % player_url)
330 code = urlh.read()
83799698 331 res = self._parse_sig_swf(code)
e0df6211
PH
332 else:
333 assert False, 'Invalid player type %r' % player_type
334
f8061589 335 if cache_enabled:
edf3e38e 336 try:
c705320f
PH
337 test_string = u''.join(map(compat_chr, range(slen)))
338 cache_res = res(test_string)
edf3e38e
PH
339 cache_spec = [ord(c) for c in cache_res]
340 try:
341 os.makedirs(os.path.dirname(cache_fn))
342 except OSError as ose:
343 if ose.errno != errno.EEXIST:
344 raise
345 write_json_file(cache_spec, cache_fn)
0ca96d48 346 except Exception:
edf3e38e
PH
347 tb = traceback.format_exc()
348 self._downloader.report_warning(
349 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
350
351 return res
352
edf3e38e
PH
353 def _print_sig_code(self, func, slen):
354 def gen_sig_code(idxs):
355 def _genslice(start, end, step):
356 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
357 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
358 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
359 return u's[%s%s%s]' % (starts, ends, steps)
360
361 step = None
0ca96d48
PH
362 start = '(Never used)' # Quelch pyflakes warnings - start will be
363 # set as soon as step is set
edf3e38e
PH
364 for i, prev in zip(idxs[1:], idxs[:-1]):
365 if step is not None:
366 if i - prev == step:
367 continue
368 yield _genslice(start, prev, step)
369 step = None
370 continue
371 if i - prev in [-1, 1]:
372 step = i - prev
373 start = prev
374 continue
375 else:
376 yield u's[%d]' % prev
377 if step is None:
378 yield u's[%d]' % i
379 else:
380 yield _genslice(start, i, step)
381
c705320f
PH
382 test_string = u''.join(map(compat_chr, range(slen)))
383 cache_res = func(test_string)
edf3e38e
PH
384 cache_spec = [ord(c) for c in cache_res]
385 expr_code = u' + '.join(gen_sig_code(cache_spec))
386 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 387 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 388
e0df6211
PH
389 def _parse_sig_js(self, jscode):
390 funcname = self._search_regex(
391 r'signature=([a-zA-Z]+)', jscode,
392 u'Initial JS player signature function name')
393
394 functions = {}
395
396 def argidx(varname):
397 return string.lowercase.index(varname)
398
399 def interpret_statement(stmt, local_vars, allow_recursion=20):
400 if allow_recursion < 0:
0ca96d48 401 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
402
403 if stmt.startswith(u'var '):
404 stmt = stmt[len(u'var '):]
405 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
406 r'=(?P<expr>.*)$', stmt)
407 if ass_m:
408 if ass_m.groupdict().get('index'):
409 def assign(val):
410 lvar = local_vars[ass_m.group('out')]
411 idx = interpret_expression(ass_m.group('index'),
412 local_vars, allow_recursion)
413 assert isinstance(idx, int)
414 lvar[idx] = val
415 return val
416 expr = ass_m.group('expr')
417 else:
418 def assign(val):
419 local_vars[ass_m.group('out')] = val
420 return val
421 expr = ass_m.group('expr')
422 elif stmt.startswith(u'return '):
423 assign = lambda v: v
424 expr = stmt[len(u'return '):]
425 else:
426 raise ExtractorError(
427 u'Cannot determine left side of statement in %r' % stmt)
428
429 v = interpret_expression(expr, local_vars, allow_recursion)
430 return assign(v)
431
432 def interpret_expression(expr, local_vars, allow_recursion):
433 if expr.isdigit():
434 return int(expr)
435
436 if expr.isalpha():
437 return local_vars[expr]
438
439 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
440 if m:
441 member = m.group('member')
442 val = local_vars[m.group('in')]
443 if member == 'split("")':
444 return list(val)
445 if member == 'join("")':
446 return u''.join(val)
447 if member == 'length':
448 return len(val)
449 if member == 'reverse()':
450 return val[::-1]
451 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
452 if slice_m:
453 idx = interpret_expression(
454 slice_m.group('idx'), local_vars, allow_recursion-1)
455 return val[idx:]
456
457 m = re.match(
458 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
459 if m:
460 val = local_vars[m.group('in')]
461 idx = interpret_expression(m.group('idx'), local_vars,
462 allow_recursion-1)
463 return val[idx]
464
465 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
466 if m:
467 a = interpret_expression(m.group('a'),
468 local_vars, allow_recursion)
469 b = interpret_expression(m.group('b'),
470 local_vars, allow_recursion)
471 return a % b
472
473 m = re.match(
474 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
475 if m:
476 fname = m.group('func')
477 if fname not in functions:
478 functions[fname] = extract_function(fname)
479 argvals = [int(v) if v.isdigit() else local_vars[v]
480 for v in m.group('args').split(',')]
481 return functions[fname](argvals)
482 raise ExtractorError(u'Unsupported JS expression %r' % expr)
483
484 def extract_function(funcname):
485 func_m = re.search(
486 r'function ' + re.escape(funcname) +
487 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
488 jscode)
489 argnames = func_m.group('args').split(',')
490
491 def resf(args):
492 local_vars = dict(zip(argnames, args))
493 for stmt in func_m.group('code').split(';'):
494 res = interpret_statement(stmt, local_vars)
495 return res
496 return resf
497
498 initial_function = extract_function(funcname)
499 return lambda s: initial_function([s])
500
501 def _parse_sig_swf(self, file_contents):
502 if file_contents[1:3] != b'WS':
503 raise ExtractorError(
504 u'Not an SWF file; header is %r' % file_contents[:3])
505 if file_contents[:1] == b'C':
506 content = zlib.decompress(file_contents[8:])
507 else:
508 raise NotImplementedError(u'Unsupported compression format %r' %
509 file_contents[:1])
510
511 def extract_tags(content):
512 pos = 0
513 while pos < len(content):
514 header16 = struct.unpack('<H', content[pos:pos+2])[0]
515 pos += 2
516 tag_code = header16 >> 6
517 tag_len = header16 & 0x3f
518 if tag_len == 0x3f:
519 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
520 pos += 4
521 assert pos+tag_len <= len(content)
522 yield (tag_code, content[pos:pos+tag_len])
523 pos += tag_len
524
525 code_tag = next(tag
526 for tag_code, tag in extract_tags(content)
527 if tag_code == 82)
528 p = code_tag.index(b'\0', 4) + 1
ba552f54 529 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
530
531 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
532 def read_int(reader=None):
533 if reader is None:
534 reader = code_reader
e0df6211
PH
535 res = 0
536 shift = 0
537 for _ in range(5):
ba552f54
PH
538 buf = reader.read(1)
539 assert len(buf) == 1
540 b = struct.unpack('<B', buf)[0]
e0df6211
PH
541 res = res | ((b & 0x7f) << shift)
542 if b & 0x80 == 0:
543 break
544 shift += 7
ba552f54
PH
545 return res
546
547 def u30(reader=None):
548 res = read_int(reader)
549 assert res & 0xf0000000 == 0
e0df6211
PH
550 return res
551 u32 = read_int
552
ba552f54
PH
553 def s32(reader=None):
554 v = read_int(reader)
e0df6211
PH
555 if v & 0x80000000 != 0:
556 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
557 return v
558
0ca96d48 559 def read_string(reader=None):
ba552f54
PH
560 if reader is None:
561 reader = code_reader
562 slen = u30(reader)
563 resb = reader.read(slen)
564 assert len(resb) == slen
565 return resb.decode('utf-8')
566
567 def read_bytes(count, reader=None):
568 if reader is None:
569 reader = code_reader
570 resb = reader.read(count)
571 assert len(resb) == count
572 return resb
573
574 def read_byte(reader=None):
575 resb = read_bytes(1, reader=reader)
576 res = struct.unpack('<B', resb)[0]
577 return res
e0df6211
PH
578
579 # minor_version + major_version
0ca96d48 580 read_bytes(2 + 2)
e0df6211
PH
581
582 # Constant pool
ba552f54 583 int_count = u30()
e0df6211 584 for _c in range(1, int_count):
0ca96d48 585 s32()
ba552f54 586 uint_count = u30()
e0df6211 587 for _c in range(1, uint_count):
0ca96d48 588 u32()
ba552f54 589 double_count = u30()
0ca96d48 590 read_bytes((double_count-1) * 8)
ba552f54 591 string_count = u30()
e0df6211
PH
592 constant_strings = [u'']
593 for _c in range(1, string_count):
0ca96d48 594 s = read_string()
e0df6211 595 constant_strings.append(s)
ba552f54 596 namespace_count = u30()
e0df6211 597 for _c in range(1, namespace_count):
0ca96d48
PH
598 read_bytes(1) # kind
599 u30() # name
ba552f54 600 ns_set_count = u30()
e0df6211 601 for _c in range(1, ns_set_count):
ba552f54 602 count = u30()
e0df6211 603 for _c2 in range(count):
0ca96d48 604 u30()
ba552f54 605 multiname_count = u30()
e0df6211
PH
606 MULTINAME_SIZES = {
607 0x07: 2, # QName
608 0x0d: 2, # QNameA
609 0x0f: 1, # RTQName
610 0x10: 1, # RTQNameA
611 0x11: 0, # RTQNameL
612 0x12: 0, # RTQNameLA
613 0x09: 2, # Multiname
614 0x0e: 2, # MultinameA
615 0x1b: 1, # MultinameL
616 0x1c: 1, # MultinameLA
617 }
618 multinames = [u'']
619 for _c in range(1, multiname_count):
ba552f54 620 kind = u30()
e0df6211
PH
621 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
622 if kind == 0x07:
0ca96d48 623 u30() # namespace_idx
ba552f54 624 name_idx = u30()
e0df6211
PH
625 multinames.append(constant_strings[name_idx])
626 else:
627 multinames.append('[MULTINAME kind: %d]' % kind)
628 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 629 u30()
e0df6211
PH
630
631 # Methods
ba552f54 632 method_count = u30()
e0df6211
PH
633 MethodInfo = collections.namedtuple(
634 'MethodInfo',
635 ['NEED_ARGUMENTS', 'NEED_REST'])
636 method_infos = []
637 for method_id in range(method_count):
ba552f54 638 param_count = u30()
0ca96d48 639 u30() # return type
e0df6211 640 for _ in range(param_count):
0ca96d48
PH
641 u30() # param type
642 u30() # name index (always 0 for youtube)
ba552f54 643 flags = read_byte()
e0df6211
PH
644 if flags & 0x08 != 0:
645 # Options present
ba552f54 646 option_count = u30()
e0df6211 647 for c in range(option_count):
0ca96d48
PH
648 u30() # val
649 read_bytes(1) # kind
e0df6211
PH
650 if flags & 0x80 != 0:
651 # Param names present
652 for _ in range(param_count):
0ca96d48 653 u30() # param name
e0df6211
PH
654 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
655 method_infos.append(mi)
656
657 # Metadata
ba552f54 658 metadata_count = u30()
e0df6211 659 for _c in range(metadata_count):
0ca96d48 660 u30() # name
ba552f54 661 item_count = u30()
e0df6211 662 for _c2 in range(item_count):
0ca96d48
PH
663 u30() # key
664 u30() # value
ba552f54
PH
665
666 def parse_traits_info():
667 trait_name_idx = u30()
668 kind_full = read_byte()
e0df6211
PH
669 kind = kind_full & 0x0f
670 attrs = kind_full >> 4
671 methods = {}
672 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
673 u30() # Slot id
674 u30() # type_name_idx
ba552f54 675 vindex = u30()
e0df6211 676 if vindex != 0:
0ca96d48 677 read_byte() # vkind
e0df6211 678 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 679 u30() # disp_id
ba552f54 680 method_idx = u30()
e0df6211
PH
681 methods[multinames[trait_name_idx]] = method_idx
682 elif kind == 0x04: # Class
0ca96d48
PH
683 u30() # slot_id
684 u30() # classi
e0df6211 685 elif kind == 0x05: # Function
0ca96d48 686 u30() # slot_id
ba552f54 687 function_idx = u30()
e0df6211
PH
688 methods[function_idx] = multinames[trait_name_idx]
689 else:
690 raise ExtractorError(u'Unsupported trait kind %d' % kind)
691
692 if attrs & 0x4 != 0: # Metadata present
ba552f54 693 metadata_count = u30()
e0df6211 694 for _c3 in range(metadata_count):
0ca96d48 695 u30() # metadata index
e0df6211 696
ba552f54 697 return methods
e0df6211
PH
698
699 # Classes
700 TARGET_CLASSNAME = u'SignatureDecipher'
701 searched_idx = multinames.index(TARGET_CLASSNAME)
702 searched_class_id = None
ba552f54 703 class_count = u30()
e0df6211 704 for class_id in range(class_count):
ba552f54 705 name_idx = u30()
e0df6211
PH
706 if name_idx == searched_idx:
707 # We found the class we're looking for!
708 searched_class_id = class_id
0ca96d48 709 u30() # super_name idx
ba552f54 710 flags = read_byte()
e0df6211 711 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 712 u30() # protected_ns_idx
ba552f54 713 intrf_count = u30()
e0df6211 714 for _c2 in range(intrf_count):
0ca96d48
PH
715 u30()
716 u30() # iinit
ba552f54 717 trait_count = u30()
e0df6211 718 for _c2 in range(trait_count):
0ca96d48 719 parse_traits_info()
e0df6211
PH
720
721 if searched_class_id is None:
722 raise ExtractorError(u'Target class %r not found' %
723 TARGET_CLASSNAME)
724
725 method_names = {}
726 method_idxs = {}
727 for class_id in range(class_count):
0ca96d48 728 u30() # cinit
ba552f54 729 trait_count = u30()
e0df6211 730 for _c2 in range(trait_count):
ba552f54 731 trait_methods = parse_traits_info()
e0df6211
PH
732 if class_id == searched_class_id:
733 method_names.update(trait_methods.items())
734 method_idxs.update(dict(
735 (idx, name)
736 for name, idx in trait_methods.items()))
737
738 # Scripts
ba552f54 739 script_count = u30()
e0df6211 740 for _c in range(script_count):
0ca96d48 741 u30() # init
ba552f54 742 trait_count = u30()
e0df6211 743 for _c2 in range(trait_count):
0ca96d48 744 parse_traits_info()
e0df6211
PH
745
746 # Method bodies
ba552f54 747 method_body_count = u30()
e0df6211
PH
748 Method = collections.namedtuple('Method', ['code', 'local_count'])
749 methods = {}
750 for _c in range(method_body_count):
ba552f54 751 method_idx = u30()
0ca96d48 752 u30() # max_stack
ba552f54 753 local_count = u30()
0ca96d48
PH
754 u30() # init_scope_depth
755 u30() # max_scope_depth
ba552f54
PH
756 code_length = u30()
757 code = read_bytes(code_length)
e0df6211 758 if method_idx in method_idxs:
ba552f54 759 m = Method(code, local_count)
e0df6211 760 methods[method_idxs[method_idx]] = m
ba552f54 761 exception_count = u30()
e0df6211 762 for _c2 in range(exception_count):
0ca96d48
PH
763 u30() # from
764 u30() # to
765 u30() # target
766 u30() # exc_type
767 u30() # var_name
ba552f54 768 trait_count = u30()
e0df6211 769 for _c2 in range(trait_count):
0ca96d48 770 parse_traits_info()
e0df6211 771
ba552f54 772 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
773 assert len(methods) == len(method_idxs)
774
775 method_pyfunctions = {}
776
777 def extract_function(func_name):
778 if func_name in method_pyfunctions:
779 return method_pyfunctions[func_name]
780 if func_name not in methods:
781 raise ExtractorError(u'Cannot find function %r' % func_name)
782 m = methods[func_name]
783
784 def resfunc(args):
e0df6211
PH
785 registers = ['(this)'] + list(args) + [None] * m.local_count
786 stack = []
787 coder = io.BytesIO(m.code)
788 while True:
789 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 790 if opcode == 36: # pushbyte
e0df6211
PH
791 v = struct.unpack('!B', coder.read(1))[0]
792 stack.append(v)
793 elif opcode == 44: # pushstring
794 idx = u30(coder)
795 stack.append(constant_strings[idx])
796 elif opcode == 48: # pushscope
797 # We don't implement the scope register, so we'll just
798 # ignore the popped value
799 stack.pop()
800 elif opcode == 70: # callproperty
801 index = u30(coder)
802 mname = multinames[index]
803 arg_count = u30(coder)
804 args = list(reversed(
805 [stack.pop() for _ in range(arg_count)]))
806 obj = stack.pop()
807 if mname == u'split':
808 assert len(args) == 1
809 assert isinstance(args[0], compat_str)
810 assert isinstance(obj, compat_str)
811 if args[0] == u'':
812 res = list(obj)
813 else:
814 res = obj.split(args[0])
815 stack.append(res)
a7177865
PH
816 elif mname == u'slice':
817 assert len(args) == 1
818 assert isinstance(args[0], int)
819 assert isinstance(obj, list)
820 res = obj[args[0]:]
821 stack.append(res)
822 elif mname == u'join':
823 assert len(args) == 1
824 assert isinstance(args[0], compat_str)
825 assert isinstance(obj, list)
826 res = args[0].join(obj)
827 stack.append(res)
e0df6211
PH
828 elif mname in method_pyfunctions:
829 stack.append(method_pyfunctions[mname](args))
830 else:
831 raise NotImplementedError(
832 u'Unsupported property %r on %r'
833 % (mname, obj))
a7177865
PH
834 elif opcode == 72: # returnvalue
835 res = stack.pop()
836 return res
837 elif opcode == 79: # callpropvoid
838 index = u30(coder)
839 mname = multinames[index]
840 arg_count = u30(coder)
841 args = list(reversed(
842 [stack.pop() for _ in range(arg_count)]))
843 obj = stack.pop()
844 if mname == u'reverse':
845 assert isinstance(obj, list)
846 obj.reverse()
847 else:
848 raise NotImplementedError(
849 u'Unsupported (void) property %r on %r'
850 % (mname, obj))
e0df6211
PH
851 elif opcode == 93: # findpropstrict
852 index = u30(coder)
853 mname = multinames[index]
854 res = extract_function(mname)
855 stack.append(res)
856 elif opcode == 97: # setproperty
857 index = u30(coder)
858 value = stack.pop()
859 idx = stack.pop()
860 obj = stack.pop()
861 assert isinstance(obj, list)
862 assert isinstance(idx, int)
863 obj[idx] = value
864 elif opcode == 98: # getlocal
865 index = u30(coder)
866 stack.append(registers[index])
867 elif opcode == 99: # setlocal
868 index = u30(coder)
869 value = stack.pop()
870 registers[index] = value
871 elif opcode == 102: # getproperty
872 index = u30(coder)
873 pname = multinames[index]
874 if pname == u'length':
875 obj = stack.pop()
876 assert isinstance(obj, list)
877 stack.append(len(obj))
878 else: # Assume attribute access
879 idx = stack.pop()
880 assert isinstance(idx, int)
881 obj = stack.pop()
882 assert isinstance(obj, list)
883 stack.append(obj[idx])
884 elif opcode == 128: # coerce
0ca96d48 885 u30(coder)
e0df6211
PH
886 elif opcode == 133: # coerce_s
887 assert isinstance(stack[-1], (type(None), compat_str))
888 elif opcode == 164: # modulo
889 value2 = stack.pop()
890 value1 = stack.pop()
891 res = value1 % value2
892 stack.append(res)
a7177865
PH
893 elif opcode == 208: # getlocal_0
894 stack.append(registers[0])
895 elif opcode == 209: # getlocal_1
896 stack.append(registers[1])
897 elif opcode == 210: # getlocal_2
898 stack.append(registers[2])
899 elif opcode == 211: # getlocal_3
900 stack.append(registers[3])
e0df6211
PH
901 elif opcode == 214: # setlocal_2
902 registers[2] = stack.pop()
903 elif opcode == 215: # setlocal_3
904 registers[3] = stack.pop()
905 else:
906 raise NotImplementedError(
907 u'Unsupported opcode %d' % opcode)
908
909 method_pyfunctions[func_name] = resfunc
910 return resfunc
911
912 initial_function = extract_function(u'decipher')
913 return lambda s: initial_function([s])
914
83799698 915 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 916 """Turn the encrypted s field into a working signature"""
6b37f0be 917
83799698 918 if player_url is not None:
9f9be844
PH
919 if player_url.startswith(u'//'):
920 player_url = u'https:' + player_url
e0df6211 921 try:
7f8ae73a
PH
922 player_id = (player_url, len(s))
923 if player_id not in self._player_cache:
83799698 924 func = self._extract_signature_function(
c4417ddb 925 video_id, player_url, len(s)
e0df6211 926 )
7f8ae73a
PH
927 self._player_cache[player_id] = func
928 func = self._player_cache[player_id]
edf3e38e
PH
929 if self._downloader.params.get('youtube_print_sig_code'):
930 self._print_sig_code(func, len(s))
931 return func(s)
0ca96d48 932 except Exception:
e0df6211 933 tb = traceback.format_exc()
83799698
PH
934 self._downloader.report_warning(
935 u'Automatic signature extraction failed: ' + tb)
e0df6211 936
d2d8f895
PH
937 self._downloader.report_warning(
938 u'Warning: Falling back to static signature algorithm')
920de7a2 939
2f2ffea9
PH
940 return self._static_decrypt_signature(
941 s, video_id, player_url, age_gate)
e0df6211 942
2f2ffea9 943 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
944 if age_gate:
945 # The videos with age protection use another player, so the
946 # algorithms can be different.
947 if len(s) == 86:
948 return s[2:63] + s[82] + s[64:82] + s[63]
949
bc4b9008 950 if len(s) == 93:
951 return s[86:29:-1] + s[88] + s[28:5:-1]
952 elif len(s) == 92:
444b1165 953 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
954 elif len(s) == 91:
955 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
956 elif len(s) == 90:
957 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 958 elif len(s) == 89:
959 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 960 elif len(s) == 88:
3e223834 961 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 962 elif len(s) == 87:
3a725669 963 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 964 elif len(s) == 86:
f2c327fd 965 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 966 elif len(s) == 85:
6ae8ee3f 967 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 968 elif len(s) == 84:
6f56389b 969 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 970 elif len(s) == 83:
920de7a2 971 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 972 elif len(s) == 82:
c21315f2 973 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 974 elif len(s) == 81:
aedd6bb9 975 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
976 elif len(s) == 80:
977 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
978 elif len(s) == 79:
979 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
980
981 else:
982 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 983
1f343eaa 984 def _get_available_subtitles(self, video_id, webpage):
de7f3446 985 try:
7fad1c63
JMF
986 sub_list = self._download_webpage(
987 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
988 video_id, note=False)
989 except ExtractorError as err:
de7f3446
JMF
990 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
991 return {}
992 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
993
994 sub_lang_list = {}
995 for l in lang_list:
996 lang = l[1]
997 params = compat_urllib_parse.urlencode({
998 'lang': lang,
999 'v': video_id,
ca715127 1000 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 1001 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446
JMF
1002 })
1003 url = u'http://www.youtube.com/api/timedtext?' + params
1004 sub_lang_list[lang] = url
1005 if not sub_lang_list:
1006 self._downloader.report_warning(u'video doesn\'t have subtitles')
1007 return {}
1008 return sub_lang_list
1009
055e6f36 1010 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1011 """We need the webpage for getting the captions url, pass it as an
1012 argument to speed up the process."""
ca715127 1013 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1014 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1015 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1016 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1017 if mobj is None:
1018 self._downloader.report_warning(err_msg)
1019 return {}
1020 player_config = json.loads(mobj.group(1))
1021 try:
1022 args = player_config[u'args']
1023 caption_url = args[u'ttsurl']
1024 timestamp = args[u'timestamp']
055e6f36
JMF
1025 # We get the available subtitles
1026 list_params = compat_urllib_parse.urlencode({
1027 'type': 'list',
1028 'tlangs': 1,
1029 'asrs': 1,
de7f3446 1030 })
055e6f36 1031 list_url = caption_url + '&' + list_params
e26f8712 1032 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1033 original_lang_node = caption_list.find('track')
f6a54188 1034 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1035 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1036 return {}
1037 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1038
1039 sub_lang_list = {}
1040 for lang_node in caption_list.findall('target'):
1041 sub_lang = lang_node.attrib['lang_code']
1042 params = compat_urllib_parse.urlencode({
1043 'lang': original_lang,
1044 'tlang': sub_lang,
1045 'fmt': sub_format,
1046 'ts': timestamp,
1047 'kind': 'asr',
1048 })
1049 sub_lang_list[sub_lang] = caption_url + '&' + params
1050 return sub_lang_list
de7f3446
JMF
1051 # An extractor error can be raise by the download process if there are
1052 # no automatic captions but there are subtitles
1053 except (KeyError, ExtractorError):
1054 self._downloader.report_warning(err_msg)
1055 return {}
1056
c5e8d7af
PH
1057 def _extract_id(self, url):
1058 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1059 if mobj is None:
1060 raise ExtractorError(u'Invalid URL: %s' % url)
1061 video_id = mobj.group(2)
1062 return video_id
1063
1d043b93
JMF
1064 def _get_video_url_list(self, url_map):
1065 """
1066 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1067 with the requested formats.
1068 """
2c62dc26 1069 existing_formats = [x for x in self._formats if x in url_map]
1d043b93
JMF
1070 if len(existing_formats) == 0:
1071 raise ExtractorError(u'no known formats available for video')
4ea3be0a 1072 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1073 video_url_list.reverse() # order worst to best
1d043b93
JMF
1074 return video_url_list
1075
1076 def _extract_from_m3u8(self, manifest_url, video_id):
1077 url_map = {}
1078 def _get_urls(_manifest):
1079 lines = _manifest.split('\n')
1080 urls = filter(lambda l: l and not l.startswith('#'),
1081 lines)
1082 return urls
1083 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1084 formats_urls = _get_urls(manifest)
1085 for format_url in formats_urls:
890f62e8 1086 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1087 url_map[itag] = format_url
1088 return url_map
1089
1fb07d10
JG
1090 def _extract_annotations(self, video_id):
1091 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1092 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1093
c5e8d7af
PH
1094 def _real_extract(self, url):
1095 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1096 mobj = re.search(self._NEXT_URL_RE, url)
1097 if mobj:
1098 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1099 video_id = self._extract_id(url)
1100
1101 # Get video webpage
c5e8d7af 1102 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 1103 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
1104
1105 # Attempt to extract SWF player URL
e0df6211 1106 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1107 if mobj is not None:
1108 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1109 else:
1110 player_url = None
1111
1112 # Get video info
1113 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1114 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1115 self.report_age_confirmation()
1116 age_gate = True
1117 # We simulate the access to the video from www.youtube.com/v/{video_id}
1118 # this can be viewed without login into Youtube
1119 data = compat_urllib_parse.urlencode({'video_id': video_id,
fccd3771 1120 'el': 'player_embedded',
c108eb73
JMF
1121 'gl': 'US',
1122 'hl': 'en',
1123 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1124 'asv': 3,
1125 'sts':'1588',
1126 })
1127 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1128 video_info_webpage = self._download_webpage(video_info_url, video_id,
1129 note=False,
1130 errnote='unable to download video info webpage')
1131 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1132 else:
1133 age_gate = False
1134 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1135 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1136 % (video_id, el_type))
1137 video_info_webpage = self._download_webpage(video_info_url, video_id,
1138 note=False,
1139 errnote='unable to download video info webpage')
1140 video_info = compat_parse_qs(video_info_webpage)
1141 if 'token' in video_info:
1142 break
c5e8d7af
PH
1143 if 'token' not in video_info:
1144 if 'reason' in video_info:
9a82b238 1145 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1146 else:
1147 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1148
1d699755
PH
1149 if 'view_count' in video_info:
1150 view_count = int(video_info['view_count'][0])
1151 else:
1152 view_count = None
1153
c5e8d7af
PH
1154 # Check for "rental" videos
1155 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1156 raise ExtractorError(u'"rental" videos not supported')
1157
1158 # Start extracting information
1159 self.report_information_extraction(video_id)
1160
1161 # uploader
1162 if 'author' not in video_info:
1163 raise ExtractorError(u'Unable to extract uploader name')
1164 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1165
1166 # uploader_id
1167 video_uploader_id = None
1168 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1169 if mobj is not None:
1170 video_uploader_id = mobj.group(1)
1171 else:
1172 self._downloader.report_warning(u'unable to extract uploader nickname')
1173
1174 # title
a8c6b241
PH
1175 if 'title' in video_info:
1176 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1177 else:
1178 self._downloader.report_warning(u'Unable to extract video title')
1179 video_title = u'_'
c5e8d7af
PH
1180
1181 # thumbnail image
7763b04e
JMF
1182 # We try first to get a high quality image:
1183 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1184 video_webpage, re.DOTALL)
1185 if m_thumb is not None:
1186 video_thumbnail = m_thumb.group(1)
1187 elif 'thumbnail_url' not in video_info:
c5e8d7af 1188 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 1189 video_thumbnail = None
c5e8d7af
PH
1190 else: # don't panic if we can't find it
1191 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1192
1193 # upload date
1194 upload_date = None
1195 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1196 if mobj is not None:
1197 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1198 upload_date = unified_strdate(upload_date)
1199
1200 # description
1201 video_description = get_element_by_id("eow-description", video_webpage)
1202 if video_description:
27dcce19
PH
1203 video_description = re.sub(r'''(?x)
1204 <a\s+
1205 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1206 title="([^"]+)"\s+
1207 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1208 class="yt-uix-redirect-link"\s*>
1209 [^<]+
1210 </a>
1211 ''', r'\1', video_description)
c5e8d7af
PH
1212 video_description = clean_html(video_description)
1213 else:
1214 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1215 if fd_mobj:
1216 video_description = unescapeHTML(fd_mobj.group(1))
1217 else:
1218 video_description = u''
1219
336c3a69 1220 def _extract_count(klass):
46374a56
PH
1221 count = self._search_regex(
1222 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1223 video_webpage, klass, default=None)
336c3a69
JMF
1224 if count is not None:
1225 return int(count.replace(',', ''))
1226 return None
1227 like_count = _extract_count(u'likes-count')
1228 dislike_count = _extract_count(u'dislikes-count')
1229
c5e8d7af 1230 # subtitles
d82134c3 1231 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1232
c5e8d7af 1233 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1234 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1235 return
1236
1237 if 'length_seconds' not in video_info:
1238 self._downloader.report_warning(u'unable to extract video duration')
b466b702 1239 video_duration = None
c5e8d7af 1240 else:
b466b702 1241 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 1242
1fb07d10
JG
1243 # annotations
1244 video_annotations = None
1245 if self._downloader.params.get('writeannotations', False):
1246 video_annotations = self._extract_annotations(video_id)
1247
c5e8d7af 1248 # Decide which formats to download
c5e8d7af
PH
1249
1250 try:
1251 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1252 if not mobj:
1253 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1254 info = json.loads(mobj.group(1))
1255 args = info['args']
7ce7e394
JMF
1256 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1257 # this signatures are encrypted
44d46655 1258 if 'url_encoded_fmt_stream_map' not in args:
f10503db 1259 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
1260 re_signature = re.compile(r'[&,]s=')
1261 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
1262 if m_s is not None:
1263 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1264 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 1265 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 1266 if m_s is not None:
00fe14fc
JMF
1267 if 'adaptive_fmts' in video_info:
1268 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 1269 else:
00fe14fc 1270 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
1271 except ValueError:
1272 pass
1273
1274 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1275 self.report_rtmp_download()
1276 video_url_list = [(None, video_info['conn'][0])]
00fe14fc
JMF
1277 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1278 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1279 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1280 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 1281 url_map = {}
00fe14fc 1282 for url_data_str in encoded_url_map.split(','):
c5e8d7af
PH
1283 url_data = compat_parse_qs(url_data_str)
1284 if 'itag' in url_data and 'url' in url_data:
1285 url = url_data['url'][0]
1286 if 'sig' in url_data:
1287 url += '&signature=' + url_data['sig'][0]
1288 elif 's' in url_data:
e0df6211 1289 encrypted_sig = url_data['s'][0]
769fda3c 1290 if self._downloader.params.get('verbose'):
c108eb73 1291 if age_gate:
bdde940e
PH
1292 if player_url is None:
1293 player_version = 'unknown'
1294 else:
1295 player_version = self._search_regex(
1296 r'-(.+)\.swf$', player_url,
1297 u'flash player', fatal=False)
e0df6211 1298 player_desc = 'flash player %s' % player_version
c108eb73 1299 else:
83799698
PH
1300 player_version = self._search_regex(
1301 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1302 'html5 player', fatal=False)
e0df6211
PH
1303 player_desc = u'html5 player %s' % player_version
1304
1305 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1306 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1307 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1308
83799698 1309 if not age_gate:
e0df6211
PH
1310 jsplayer_url_json = self._search_regex(
1311 r'"assets":.+?"js":\s*("[^"]+")',
1312 video_webpage, u'JS player URL')
83799698 1313 player_url = json.loads(jsplayer_url_json)
e0df6211 1314
83799698
PH
1315 signature = self._decrypt_signature(
1316 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1317 url += '&signature=' + signature
1318 if 'ratebypass' not in url:
1319 url += '&ratebypass=yes'
1320 url_map[url_data['itag'][0]] = url
1d043b93 1321 video_url_list = self._get_video_url_list(url_map)
1d043b93
JMF
1322 elif video_info.get('hlsvp'):
1323 manifest_url = video_info['hlsvp'][0]
1324 url_map = self._extract_from_m3u8(manifest_url, video_id)
1325 video_url_list = self._get_video_url_list(url_map)
c5e8d7af 1326 else:
9abb3204 1327 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 1328
4ea3be0a 1329 formats = []
600cc1a4 1330 for itag, video_real_url in video_url_list:
2c62dc26
PH
1331 dct = {
1332 'format_id': itag,
1333 'url': video_real_url,
1334 'player_url': player_url,
1335 }
1336 dct.update(self._formats[itag])
1337 formats.append(dct)
d80044c2 1338
4bcc7bd1 1339 self._sort_formats(formats)
4ea3be0a 1340
1341 return {
1342 'id': video_id,
1343 'uploader': video_uploader,
1344 'uploader_id': video_uploader_id,
1345 'upload_date': upload_date,
1346 'title': video_title,
1347 'thumbnail': video_thumbnail,
1348 'description': video_description,
1349 'subtitles': video_subtitles,
1350 'duration': video_duration,
1351 'age_limit': 18 if age_gate else 0,
1352 'annotations': video_annotations,
1353 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1354 'view_count': view_count,
1355 'like_count': like_count,
1356 'dislike_count': dislike_count,
1357 'formats': formats,
1358 }
c5e8d7af 1359
880e1c52 1360class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1361 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1362 _VALID_URL = r"""(?:
1363 (?:https?://)?
1364 (?:\w+\.)?
1365 youtube\.com/
1366 (?:
1367 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1368 \? (?:.*?&)*? (?:p|a|list)=
1369 | p/
1370 )
715c8e7b 1371 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1372 .*
1373 |
715c8e7b 1374 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1375 )"""
dcbb4580
JMF
1376 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1377 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1378 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1379 IE_NAME = u'youtube:playlist'
1380
1381 @classmethod
1382 def suitable(cls, url):
1383 """Receives a URL and returns True if suitable for this IE."""
1384 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1385
880e1c52
JMF
1386 def _real_initialize(self):
1387 self._login()
1388
652cdaa2
JMF
1389 def _ids_to_results(self, ids):
1390 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1391 for vid_id in ids]
1392
1393 def _extract_mix(self, playlist_id):
1394 # The mixes are generated from a a single video
1395 # the id of the playlist is just 'RD' + video_id
7d4afc55 1396 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1397 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1398 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1399 get_element_by_attribute('class', 'title ', webpage))
1400 title = clean_html(title_span)
652cdaa2
JMF
1401 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1402 ids = orderedSet(re.findall(video_re, webpage))
1403 url_results = self._ids_to_results(ids)
1404
1405 return self.playlist_result(url_results, playlist_id, title)
1406
c5e8d7af
PH
1407 def _real_extract(self, url):
1408 # Extract playlist id
1409 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1410 if mobj is None:
1411 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1412 playlist_id = mobj.group(1) or mobj.group(2)
1413
1414 # Check if it's a video-specific URL
7c61bd36 1415 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1416 if 'v' in query_dict:
1417 video_id = query_dict['v'][0]
1418 if self._downloader.params.get('noplaylist'):
1419 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1420 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1421 else:
1422 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1423
7d4afc55 1424 if playlist_id.startswith('RD'):
652cdaa2
JMF
1425 # Mixes require a custom extraction process
1426 return self._extract_mix(playlist_id)
0a688bc0
JMF
1427 if playlist_id.startswith('TL'):
1428 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1429 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1430
dcbb4580
JMF
1431 # Extract the video ids from the playlist pages
1432 ids = []
c5e8d7af 1433
755eb032 1434 for page_num in itertools.count(1):
dcbb4580 1435 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1436 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1437 matches = re.finditer(self._VIDEO_RE, page)
1438 # We remove the duplicates and the link with index 0
1439 # (it's not the first video of the playlist)
1440 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1441 ids.extend(new_ids)
c5e8d7af 1442
dcbb4580 1443 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1444 break
1445
dcbb4580 1446 playlist_title = self._og_search_title(page)
c5e8d7af 1447
652cdaa2 1448 url_results = self._ids_to_results(ids)
dcbb4580 1449 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1450
1451
0a688bc0
JMF
1452class YoutubeTopListIE(YoutubePlaylistIE):
1453 IE_NAME = u'youtube:toplist'
1454 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1455 u' (Example: "yttoplist:music:Top Tracks")')
1456 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1457
1458 def _real_extract(self, url):
1459 mobj = re.match(self._VALID_URL, url)
1460 channel = mobj.group('chann')
1461 title = mobj.group('title')
1462 query = compat_urllib_parse.urlencode({'title': title})
1463 playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
1464 channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
1465 link = self._html_search_regex(playlist_re, channel_page, u'list')
1466 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
1467
1468 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1469 ids = []
1470 # sometimes the webpage doesn't contain the videos
1471 # retry until we get them
1472 for i in itertools.count(0):
1473 msg = u'Downloading Youtube top list'
1474 if i > 0:
1475 msg += ', retry #%d' % i
1476 webpage = self._download_webpage(url, title, msg)
1477 ids = orderedSet(re.findall(video_re, webpage))
1478 if ids:
1479 break
1480 url_results = self._ids_to_results(ids)
1481 return self.playlist_result(url_results, playlist_title=title)
1482
1483
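# An illustrative helper (not part of the extractor) showing how a "yttoplist"
# pseudo-URL is split by the _VALID_URL pattern of YoutubeTopListIE above; the
# input string is the documented example.
def _example_parse_yttoplist(spec):
    mobj = re.match(r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$', spec)
    return mobj.group('chann'), mobj.group('title')
# _example_parse_yttoplist('yttoplist:music:Top Tracks') == ('music', 'Top Tracks')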
c5e8d7af 1484class YoutubeChannelIE(InfoExtractor):
0f818663 1485 IE_DESC = u'YouTube.com channels'
c5e8d7af 1486 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1487 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1488 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1489 IE_NAME = u'youtube:channel'
1490
1491 def extract_videos_from_page(self, page):
1492 ids_in_page = []
1493 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1494 if mobj.group(1) not in ids_in_page:
1495 ids_in_page.append(mobj.group(1))
1496 return ids_in_page
1497
1498 def _real_extract(self, url):
1499 # Extract channel id
1500 mobj = re.match(self._VALID_URL, url)
1501 if mobj is None:
1502 raise ExtractorError(u'Invalid URL: %s' % url)
1503
1504 # Download channel page
1505 channel_id = mobj.group(1)
1506 video_ids = []
b9643eed
JMF
1507 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1508 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1509 autogenerated = re.search(r'''(?x)
1510 class="[^"]*?(?:
1511 channel-header-autogenerated-label|
1512 yt-channel-title-autogenerated
1513 )[^"]*"''', channel_page) is not None
c5e8d7af 1514
b9643eed
JMF
1515 if autogenerated:
1516 # The videos are contained in a single page
1517 # the ajax pages can't be used, they are empty
1518 video_ids = self.extract_videos_from_page(channel_page)
1519 else:
1520 # Download all channel pages using the json-based channel_ajax query
1521 for pagenum in itertools.count(1):
1522 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1523 page = self._download_webpage(url, channel_id,
1524 u'Downloading page #%s' % pagenum)
1525
1526 page = json.loads(page)
1527
1528 ids_in_page = self.extract_videos_from_page(page['content_html'])
1529 video_ids.extend(ids_in_page)
1530
1531 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1532 break
c5e8d7af
PH
1533
1534 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1535
7012b23c
PH
1536 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1537 for video_id in video_ids]
1538 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1539
1540
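# A minimal sketch (not part of the extractor) of the c4_browse_ajax paging
# used by YoutubeChannelIE above: each JSON response carries the video links in
# 'content_html' and the "load more" widget markup in 'load_more_widget_html';
# paging stops once that widget no longer contains the indicator.  'fetch_json'
# is a hypothetical stand-in for downloading the page and running json.loads;
# unlike the real extract_videos_from_page, duplicate ids are not filtered here.
def _example_channel_video_ids(channel_id, fetch_json,
                               more_pages_indicator='yt-uix-load-more'):
    video_ids = []
    for pagenum in itertools.count(1):
        page = fetch_json(
            'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1'
            '&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
            % (pagenum, channel_id))
        video_ids.extend(re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?',
                                    page['content_html']))
        if more_pages_indicator not in page['load_more_widget_html']:
            break
    return video_ids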
1541class YoutubeUserIE(InfoExtractor):
0f818663 1542 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1543 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1544 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1545 _GDATA_PAGE_SIZE = 50
fd9cf738 1546 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1547 IE_NAME = u'youtube:user'
1548
e3ea4790 1549 @classmethod
f4b05232 1550 def suitable(cls, url):
e3ea4790
JMF
1551 # Don't return True if the url can be extracted with another youtube
1552 # extractor; the regex is too permissive and it would match.
1553 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1554 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1555 else: return super(YoutubeUserIE, cls).suitable(url)
1556
c5e8d7af
PH
1557 def _real_extract(self, url):
1558 # Extract username
1559 mobj = re.match(self._VALID_URL, url)
1560 if mobj is None:
1561 raise ExtractorError(u'Invalid URL: %s' % url)
1562
1563 username = mobj.group(1)
1564
1565 # Download video ids using YouTube Data API. Result size per
1566 # query is limited (currently to 50 videos) so we need to query
1567 # page by page until there are no video ids - it means we got
1568 # all of them.
1569
e302f9ce 1570 url_results = []
c5e8d7af 1571
755eb032 1572 for pagenum in itertools.count(0):
c5e8d7af
PH
1573 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1574
1575 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1576 page = self._download_webpage(gdata_url, username,
1577 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1578
fd9cf738
JMF
1579 try:
1580 response = json.loads(page)
1581 except ValueError as err:
1582 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637
JMF
1583 if 'entry' not in response['feed']:
1584 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1585 break
fd9cf738 1586
c5e8d7af 1587 # Extract video identifiers
e302f9ce
PH
1588 entries = response['feed']['entry']
1589 for entry in entries:
1590 title = entry['title']['$t']
1591 video_id = entry['id']['$t'].split('/')[-1]
1592 url_results.append({
1593 '_type': 'url',
1594 'url': video_id,
1595 'ie_key': 'Youtube',
1596 'id': video_id,
1597 'title': title,
1598 })
c5e8d7af
PH
1599
1600 # A little optimization: if the current page is not
1601 # "full", i.e. it contains fewer than _GDATA_PAGE_SIZE
1602 # video ids, we can assume it is the last one and that
1603 # there are no more ids on further pages, so there is
1604 # no need to query again.
1605
e302f9ce 1606 if len(entries) < self._GDATA_PAGE_SIZE:
c5e8d7af
PH
1607 break
1608
7012b23c
PH
1609 return self.playlist_result(url_results, playlist_title=username)
1610
b05654f0
PH
1611
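# A quick illustration (not part of the extractor) of the GData paging
# arithmetic used by YoutubeUserIE above: start-index is 1-based, so page 0
# starts at 1, page 1 at 51, and so on; the loop above stops as soon as a page
# returns fewer than _GDATA_PAGE_SIZE entries.
def _example_gdata_start_indices(page_size=50, pages=3):
    # -> [1, 51, 101] for the default arguments
    return [pagenum * page_size + 1 for pagenum in range(pages)]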
1612class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1613 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1614 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1615 _MAX_RESULTS = 1000
1616 IE_NAME = u'youtube:search'
1617 _SEARCH_KEY = 'ytsearch'
1618
b05654f0
PH
1619 def _get_n_results(self, query, n):
1620 """Get a specified number of results for a query"""
1621
1622 video_ids = []
1623 pagenum = 0
1624 limit = n
1625
1626 while (50 * pagenum) < limit:
b05654f0 1627 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
7cc3570e
PH
1628 data_json = self._download_webpage(
1629 result_url, video_id=u'query "%s"' % query,
1630 note=u'Downloading page %s' % (pagenum + 1),
1631 errnote=u'Unable to download API page')
1632 data = json.loads(data_json)
1633 api_response = data['data']
1634
1635 if 'items' not in api_response:
b05654f0
PH
1636 raise ExtractorError(u'[youtube] No video results')
1637
1638 new_ids = list(video['id'] for video in api_response['items'])
1639 video_ids += new_ids
1640
1641 limit = min(n, api_response['totalItems'])
1642 pagenum += 1
1643
1644 if len(video_ids) > n:
1645 video_ids = video_ids[:n]
7012b23c
PH
1646 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1647 for video_id in video_ids]
b05654f0 1648 return self.playlist_result(videos, query)
75dff0ee 1649
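# A quick illustration (not part of the extractor) of how many API pages the
# search loop above downloads: results arrive 50 at a time and the effective
# limit is capped by the API's totalItems, so at most
# ceil(min(n, total_items) / 50) pages are fetched (assuming totalItems is
# already known).
def _example_search_page_count(n, total_items, page_size=50):
    limit = min(n, total_items)
    return (limit + page_size - 1) // page_size  # integer ceiling division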
a3dd9248 1650class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 1651 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248
CM
1652 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1653 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1654 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1655
1656class YoutubeShowIE(InfoExtractor):
0f818663 1657 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1658 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1659 IE_NAME = u'youtube:show'
1660
1661 def _real_extract(self, url):
1662 mobj = re.match(self._VALID_URL, url)
1663 show_name = mobj.group(1)
1664 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1665 # There's one playlist for each season of the show
1666 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1667 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1668 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1669
1670
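# Illustrative example (hypothetical HTML fragment, not part of the extractor):
# the season links YoutubeShowIE above collects are plain /playlist?list=...
# hrefs, one per season of the show.
#   >>> re.findall(r'href="(/playlist\?list=.*?)"',
#   ...            '<a href="/playlist?list=PL0123456789A">Season 1</a>')
#   ['/playlist?list=PL0123456789A']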
b2e8bc1b 1671class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1672 """
1673 Base class for extractors that fetch info from
1674 http://www.youtube.com/feed_ajax
1675 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1676 """
b2e8bc1b 1677 _LOGIN_REQUIRED = True
43ba5456
JMF
1678 # If True, use action_load_personal_feed instead of action_load_system_feed
1679 _PERSONAL_FEED = False
04cc9617 1680
d7ae0639
JMF
1681 @property
1682 def _FEED_TEMPLATE(self):
43ba5456
JMF
1683 action = 'action_load_system_feed'
1684 if self._PERSONAL_FEED:
1685 action = 'action_load_personal_feed'
1686 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
d7ae0639
JMF
1687
1688 @property
1689 def IE_NAME(self):
1690 return u'youtube:%s' % self._FEED_NAME
04cc9617 1691
81f0259b 1692 def _real_initialize(self):
b2e8bc1b 1693 self._login()
81f0259b 1694
04cc9617
JMF
1695 def _real_extract(self, url):
1696 feed_entries = []
0e44d838
JMF
1697 paging = 0
1698 for i in itertools.count(1):
d7ae0639
JMF
1699 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1700 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1701 u'Downloading page %s' % i)
1702 info = json.loads(info)
1703 feed_html = info['feed_html']
43ba5456 1704 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617 1705 ids = orderedSet(m.group(1) for m in m_ids)
7012b23c
PH
1706 feed_entries.extend(
1707 self.url_result(video_id, 'Youtube', video_id=video_id)
1708 for video_id in ids)
04cc9617
JMF
1709 if info['paging'] is None:
1710 break
0e44d838 1711 paging = info['paging']
d7ae0639
JMF
1712 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1713
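# A minimal sketch (not part of the extractor) of the feed_ajax protocol used
# by YoutubeFeedsInfoExtractor above: every JSON page carries the rendered
# HTML in 'feed_html' and a continuation token in 'paging', which is fed back
# until the server returns None.  'fetch_json' is a hypothetical stand-in for
# downloading the page and running json.loads.
def _example_feed_video_ids(feed_name, fetch_json, personal=False):
    action = 'action_load_personal_feed' if personal else 'action_load_system_feed'
    template = ('http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s'
                % (action, feed_name))
    paging = 0
    while True:
        info = fetch_json(template % paging)
        m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
        for video_id in orderedSet(m.group(1) for m in m_ids):
            yield video_id
        if info['paging'] is None:
            break
        paging = info['paging']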
1714class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1715 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1716 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1717 _FEED_NAME = 'subscriptions'
1718 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1719
1720class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1721 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1722 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1723 _FEED_NAME = 'recommended'
1724 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1725
43ba5456
JMF
1726class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1727 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1728 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1729 _FEED_NAME = 'watch_later'
1730 _PLAYLIST_TITLE = u'Youtube Watch Later'
43ba5456 1731 _PERSONAL_FEED = True
c626a3d9 1732
f459d170
JMF
1733class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1734 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1735 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1736 _FEED_NAME = 'history'
1737 _PERSONAL_FEED = True
1738 _PLAYLIST_TITLE = u'Youtube Watch History'
1739
c626a3d9
JMF
1740class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1741 IE_NAME = u'youtube:favorites'
1742 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1743 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1744 _LOGIN_REQUIRED = True
1745
1746 def _real_extract(self, url):
1747 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1748 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1749 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1750
1751
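# Illustrative example (hypothetical page fragment, not part of the extractor):
# YoutubeFavouritesIE above only needs the playlist id embedded somewhere in
# the my_favorites page, which the regex pulls out of the first list=
# parameter it finds.
#   >>> re.search(r'list=(.+?)["&]', 'href="/playlist?list=FLabcdefgh123&x=1"').group(1)
#   'FLabcdefgh123'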
1752class YoutubeTruncatedURLIE(InfoExtractor):
1753 IE_NAME = 'youtube:truncated_url'
1754 IE_DESC = False # Do not list
1755 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1756
1757 def _real_extract(self, url):
1758 raise ExtractorError(
1759 u'Did you forget to quote the URL? Remember that & is a meta '
1760 u'character in most shells, so you want to put the URL in quotes, '
1761 u'like youtube-dl '
b4622a32
PH
1762 u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
1763 u'or simply youtube-dl BaW_jenozKc .',
15870e90 1764 expected=True)
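# Illustrative example (not part of the extractor): the pattern above only
# matches a watch URL whose query string was cut off right after "feature=...",
# which is exactly what an unquoted '&' in the shell leaves behind.
#   >>> bool(re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$',
#   ...               'https://www.youtube.com/watch?feature=player_embedded'))
#   True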