]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Simplify format specification
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af
PH
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Subclasses inherit the session set-up performed by _real_initialize:
    forcing the site language, logging in when credentials are available
    and confirming the age gate.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL; return True if a page was downloaded."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True on success and False when no credentials are
        available or any step of the login fails.

        Raises ExtractorError when _LOGIN_REQUIRED is set and no login
        info is available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False like every other failure path.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body must be bytes on Python 3; encode it the same
        # way _login encodes its form data.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age, stopping at the first failed step."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 126
8377574c 127
de7f3446 128class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 129 IE_DESC = u'YouTube.com'
cb7dfeea 130 _VALID_URL = r"""(?x)^
c5e8d7af 131 (
83aa5293 132 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 133 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
e69ae5b9
JMF
134 tube\.majestyc\.net/|
135 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
136 (?:.*?\#/)? # handle anchor (#/) redirect urls
137 (?: # the various things that can precede the ID:
138 (?:(?:v|embed|e)/) # v/ or embed/ or e/
139 |(?: # or the v= param in all its forms
d741e55a 140 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
141 (?:\?|\#!?) # the params delimiter ? or # or #!
142 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
143 v=
144 )
f4b05232
JMF
145 ))
146 |youtu\.be/ # just youtu.be/xxxx
147 )
c5e8d7af 148 )? # all until now is optional -> you can pass the naked ID
8963d9c2 149 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
150 (?(1).+)? # if we found the ID, everything can follow
151 $"""
c5e8d7af 152 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
153 _formats = {
154 '5': {'ext': 'flv', 'width': 400, 'height': 240},
155 '6': {'ext': 'flv', 'width': 450, 'height': 270},
156 '13': {'ext': '3gp'},
157 '17': {'ext': '3gp', 'width': 176, 'height': 144},
158 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
159 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
160 '34': {'ext': 'flv', 'width': 640, 'height': 360},
161 '35': {'ext': 'flv', 'width': 854, 'height': 480},
162 '36': {'ext': '3gp', 'width': 320, 'height': 240},
163 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
164 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
165 '43': {'ext': 'webm', 'width': 640, 'height': 360},
166 '44': {'ext': 'webm', 'width': 854, 'height': 480},
167 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
168 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
169
1d043b93 170
86fe61c8 171 # 3d videos
2c62dc26
PH
172 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
173 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
174 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
175 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
176 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
177 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
178 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 179
96fb5605 180 # Apple HTTP Live Streaming
2c62dc26
PH
181 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
182 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
183 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
184 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
185 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
186 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
187 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
188
189 # DASH mp4 video
190 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
191 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
192 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
193 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
194 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
195 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
196 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
836a086c 197
f6f1fc92 198 # Dash mp4 audio
2c62dc26
PH
199 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
200 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
201 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
202
203 # Dash webm
2c62dc26
PH
204 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
205 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
206 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
207 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
208 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
209 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
210 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
211
212 # Dash webm audio
213 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
214 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
c5e8d7af 215 }
836a086c 216
c5e8d7af 217 IE_NAME = u'youtube'
2eb88d95
PH
218 _TESTS = [
219 {
0e853ca4
PH
220 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
221 u"file": u"BaW_jenozKc.mp4",
222 u"info_dict": {
223 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
224 u"uploader": u"Philipp Hagemeister",
225 u"uploader_id": u"phihag",
226 u"upload_date": u"20121002",
27dcce19 227 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 228 }
0e853ca4 229 },
0e853ca4
PH
230 {
231 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
232 u"file": u"UxxajLWwzqY.mp4",
233 u"note": u"Test generic use_cipher_signature video (#897)",
234 u"info_dict": {
235 u"upload_date": u"20120506",
236 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 237 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 238 u"uploader": u"Icona Pop",
0e853ca4 239 u"uploader_id": u"IconaPop"
2eb88d95 240 }
c108eb73
JMF
241 },
242 {
243 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
244 u"file": u"07FYdnEawAQ.mp4",
245 u"note": u"Test VEVO video with age protection (#956)",
246 u"info_dict": {
247 u"upload_date": u"20130703",
248 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
249 u"description": u"md5:64249768eec3bc4276236606ea996373",
250 u"uploader": u"justintimberlakeVEVO",
251 u"uploader_id": u"justintimberlakeVEVO"
252 }
253 },
fccd3771 254 {
83aa5293 255 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
256 u"file": u"yZIXLfi8CZQ.mp4",
257 u"note": u"Embed-only video (#1746)",
258 u"info_dict": {
259 u"upload_date": u"20120608",
260 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
261 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
262 u"uploader": u"SET India",
263 u"uploader_id": u"setindia"
264 }
265 },
2eb88d95
PH
266 ]
267
c5e8d7af
PH
268
269 @classmethod
270 def suitable(cls, url):
271 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 272 if YoutubePlaylistIE.suitable(url): return False
fccd3771 273 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 274
e0df6211
PH
275 def __init__(self, *args, **kwargs):
276 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 277 self._player_cache = {}
e0df6211 278
c5e8d7af
PH
279 def report_video_info_webpage_download(self, video_id):
280 """Report attempt to download video info webpage."""
281 self.to_screen(u'%s: Downloading video info webpage' % video_id)
282
c5e8d7af
PH
283 def report_information_extraction(self, video_id):
284 """Report attempt to extract video information."""
285 self.to_screen(u'%s: Extracting video information' % video_id)
286
287 def report_unavailable_format(self, video_id, format):
288 """Report extracted video URL."""
289 self.to_screen(u'%s: Format %s not available' % (video_id, format))
290
291 def report_rtmp_download(self):
292 """Indicate the download will use the RTMP protocol."""
293 self.to_screen(u'RTMP download detected')
294
c4417ddb
PH
295 def _extract_signature_function(self, video_id, player_url, slen):
296 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 297 player_url)
e0df6211
PH
298 player_type = id_m.group('ext')
299 player_id = id_m.group('id')
300
c4417ddb
PH
301 # Read from filesystem cache
302 func_id = '%s_%s_%d' % (player_type, player_id, slen)
303 assert os.path.basename(func_id) == func_id
c38b1e77 304 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 305
c3c88a26 306 cache_enabled = cache_dir is not None
f8061589 307 if cache_enabled:
c4417ddb
PH
308 cache_fn = os.path.join(os.path.expanduser(cache_dir),
309 u'youtube-sigfuncs',
310 func_id + '.json')
311 try:
edf3e38e 312 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
313 cache_spec = json.load(cachef)
314 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 315 except IOError:
c4417ddb 316 pass # No cache available
83799698 317
e0df6211
PH
318 if player_type == 'js':
319 code = self._download_webpage(
320 player_url, video_id,
83799698 321 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 322 errnote=u'Download of %s failed' % player_url)
83799698 323 res = self._parse_sig_js(code)
c4417ddb 324 elif player_type == 'swf':
e0df6211
PH
325 urlh = self._request_webpage(
326 player_url, video_id,
83799698 327 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
328 errnote=u'Download of %s failed' % player_url)
329 code = urlh.read()
83799698 330 res = self._parse_sig_swf(code)
e0df6211
PH
331 else:
332 assert False, 'Invalid player type %r' % player_type
333
f8061589 334 if cache_enabled:
edf3e38e 335 try:
c705320f
PH
336 test_string = u''.join(map(compat_chr, range(slen)))
337 cache_res = res(test_string)
edf3e38e
PH
338 cache_spec = [ord(c) for c in cache_res]
339 try:
340 os.makedirs(os.path.dirname(cache_fn))
341 except OSError as ose:
342 if ose.errno != errno.EEXIST:
343 raise
344 write_json_file(cache_spec, cache_fn)
0ca96d48 345 except Exception:
edf3e38e
PH
346 tb = traceback.format_exc()
347 self._downloader.report_warning(
348 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
349
350 return res
351
edf3e38e
PH
352 def _print_sig_code(self, func, slen):
353 def gen_sig_code(idxs):
354 def _genslice(start, end, step):
355 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
356 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
357 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
358 return u's[%s%s%s]' % (starts, ends, steps)
359
360 step = None
0ca96d48
PH
361 start = '(Never used)' # Quelch pyflakes warnings - start will be
362 # set as soon as step is set
edf3e38e
PH
363 for i, prev in zip(idxs[1:], idxs[:-1]):
364 if step is not None:
365 if i - prev == step:
366 continue
367 yield _genslice(start, prev, step)
368 step = None
369 continue
370 if i - prev in [-1, 1]:
371 step = i - prev
372 start = prev
373 continue
374 else:
375 yield u's[%d]' % prev
376 if step is None:
377 yield u's[%d]' % i
378 else:
379 yield _genslice(start, i, step)
380
c705320f
PH
381 test_string = u''.join(map(compat_chr, range(slen)))
382 cache_res = func(test_string)
edf3e38e
PH
383 cache_spec = [ord(c) for c in cache_res]
384 expr_code = u' + '.join(gen_sig_code(cache_spec))
385 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 386 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 387
e0df6211
PH
388 def _parse_sig_js(self, jscode):
389 funcname = self._search_regex(
390 r'signature=([a-zA-Z]+)', jscode,
391 u'Initial JS player signature function name')
392
393 functions = {}
394
395 def argidx(varname):
396 return string.lowercase.index(varname)
397
398 def interpret_statement(stmt, local_vars, allow_recursion=20):
399 if allow_recursion < 0:
0ca96d48 400 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
401
402 if stmt.startswith(u'var '):
403 stmt = stmt[len(u'var '):]
404 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
405 r'=(?P<expr>.*)$', stmt)
406 if ass_m:
407 if ass_m.groupdict().get('index'):
408 def assign(val):
409 lvar = local_vars[ass_m.group('out')]
410 idx = interpret_expression(ass_m.group('index'),
411 local_vars, allow_recursion)
412 assert isinstance(idx, int)
413 lvar[idx] = val
414 return val
415 expr = ass_m.group('expr')
416 else:
417 def assign(val):
418 local_vars[ass_m.group('out')] = val
419 return val
420 expr = ass_m.group('expr')
421 elif stmt.startswith(u'return '):
422 assign = lambda v: v
423 expr = stmt[len(u'return '):]
424 else:
425 raise ExtractorError(
426 u'Cannot determine left side of statement in %r' % stmt)
427
428 v = interpret_expression(expr, local_vars, allow_recursion)
429 return assign(v)
430
431 def interpret_expression(expr, local_vars, allow_recursion):
432 if expr.isdigit():
433 return int(expr)
434
435 if expr.isalpha():
436 return local_vars[expr]
437
438 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
439 if m:
440 member = m.group('member')
441 val = local_vars[m.group('in')]
442 if member == 'split("")':
443 return list(val)
444 if member == 'join("")':
445 return u''.join(val)
446 if member == 'length':
447 return len(val)
448 if member == 'reverse()':
449 return val[::-1]
450 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
451 if slice_m:
452 idx = interpret_expression(
453 slice_m.group('idx'), local_vars, allow_recursion-1)
454 return val[idx:]
455
456 m = re.match(
457 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
458 if m:
459 val = local_vars[m.group('in')]
460 idx = interpret_expression(m.group('idx'), local_vars,
461 allow_recursion-1)
462 return val[idx]
463
464 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
465 if m:
466 a = interpret_expression(m.group('a'),
467 local_vars, allow_recursion)
468 b = interpret_expression(m.group('b'),
469 local_vars, allow_recursion)
470 return a % b
471
472 m = re.match(
473 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
474 if m:
475 fname = m.group('func')
476 if fname not in functions:
477 functions[fname] = extract_function(fname)
478 argvals = [int(v) if v.isdigit() else local_vars[v]
479 for v in m.group('args').split(',')]
480 return functions[fname](argvals)
481 raise ExtractorError(u'Unsupported JS expression %r' % expr)
482
483 def extract_function(funcname):
484 func_m = re.search(
485 r'function ' + re.escape(funcname) +
486 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
487 jscode)
488 argnames = func_m.group('args').split(',')
489
490 def resf(args):
491 local_vars = dict(zip(argnames, args))
492 for stmt in func_m.group('code').split(';'):
493 res = interpret_statement(stmt, local_vars)
494 return res
495 return resf
496
497 initial_function = extract_function(funcname)
498 return lambda s: initial_function([s])
499
500 def _parse_sig_swf(self, file_contents):
501 if file_contents[1:3] != b'WS':
502 raise ExtractorError(
503 u'Not an SWF file; header is %r' % file_contents[:3])
504 if file_contents[:1] == b'C':
505 content = zlib.decompress(file_contents[8:])
506 else:
507 raise NotImplementedError(u'Unsupported compression format %r' %
508 file_contents[:1])
509
510 def extract_tags(content):
511 pos = 0
512 while pos < len(content):
513 header16 = struct.unpack('<H', content[pos:pos+2])[0]
514 pos += 2
515 tag_code = header16 >> 6
516 tag_len = header16 & 0x3f
517 if tag_len == 0x3f:
518 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
519 pos += 4
520 assert pos+tag_len <= len(content)
521 yield (tag_code, content[pos:pos+tag_len])
522 pos += tag_len
523
524 code_tag = next(tag
525 for tag_code, tag in extract_tags(content)
526 if tag_code == 82)
527 p = code_tag.index(b'\0', 4) + 1
ba552f54 528 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
529
530 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
531 def read_int(reader=None):
532 if reader is None:
533 reader = code_reader
e0df6211
PH
534 res = 0
535 shift = 0
536 for _ in range(5):
ba552f54
PH
537 buf = reader.read(1)
538 assert len(buf) == 1
539 b = struct.unpack('<B', buf)[0]
e0df6211
PH
540 res = res | ((b & 0x7f) << shift)
541 if b & 0x80 == 0:
542 break
543 shift += 7
ba552f54
PH
544 return res
545
546 def u30(reader=None):
547 res = read_int(reader)
548 assert res & 0xf0000000 == 0
e0df6211
PH
549 return res
550 u32 = read_int
551
ba552f54
PH
552 def s32(reader=None):
553 v = read_int(reader)
e0df6211
PH
554 if v & 0x80000000 != 0:
555 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
556 return v
557
0ca96d48 558 def read_string(reader=None):
ba552f54
PH
559 if reader is None:
560 reader = code_reader
561 slen = u30(reader)
562 resb = reader.read(slen)
563 assert len(resb) == slen
564 return resb.decode('utf-8')
565
566 def read_bytes(count, reader=None):
567 if reader is None:
568 reader = code_reader
569 resb = reader.read(count)
570 assert len(resb) == count
571 return resb
572
573 def read_byte(reader=None):
574 resb = read_bytes(1, reader=reader)
575 res = struct.unpack('<B', resb)[0]
576 return res
e0df6211
PH
577
578 # minor_version + major_version
0ca96d48 579 read_bytes(2 + 2)
e0df6211
PH
580
581 # Constant pool
ba552f54 582 int_count = u30()
e0df6211 583 for _c in range(1, int_count):
0ca96d48 584 s32()
ba552f54 585 uint_count = u30()
e0df6211 586 for _c in range(1, uint_count):
0ca96d48 587 u32()
ba552f54 588 double_count = u30()
0ca96d48 589 read_bytes((double_count-1) * 8)
ba552f54 590 string_count = u30()
e0df6211
PH
591 constant_strings = [u'']
592 for _c in range(1, string_count):
0ca96d48 593 s = read_string()
e0df6211 594 constant_strings.append(s)
ba552f54 595 namespace_count = u30()
e0df6211 596 for _c in range(1, namespace_count):
0ca96d48
PH
597 read_bytes(1) # kind
598 u30() # name
ba552f54 599 ns_set_count = u30()
e0df6211 600 for _c in range(1, ns_set_count):
ba552f54 601 count = u30()
e0df6211 602 for _c2 in range(count):
0ca96d48 603 u30()
ba552f54 604 multiname_count = u30()
e0df6211
PH
605 MULTINAME_SIZES = {
606 0x07: 2, # QName
607 0x0d: 2, # QNameA
608 0x0f: 1, # RTQName
609 0x10: 1, # RTQNameA
610 0x11: 0, # RTQNameL
611 0x12: 0, # RTQNameLA
612 0x09: 2, # Multiname
613 0x0e: 2, # MultinameA
614 0x1b: 1, # MultinameL
615 0x1c: 1, # MultinameLA
616 }
617 multinames = [u'']
618 for _c in range(1, multiname_count):
ba552f54 619 kind = u30()
e0df6211
PH
620 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
621 if kind == 0x07:
0ca96d48 622 u30() # namespace_idx
ba552f54 623 name_idx = u30()
e0df6211
PH
624 multinames.append(constant_strings[name_idx])
625 else:
626 multinames.append('[MULTINAME kind: %d]' % kind)
627 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 628 u30()
e0df6211
PH
629
630 # Methods
ba552f54 631 method_count = u30()
e0df6211
PH
632 MethodInfo = collections.namedtuple(
633 'MethodInfo',
634 ['NEED_ARGUMENTS', 'NEED_REST'])
635 method_infos = []
636 for method_id in range(method_count):
ba552f54 637 param_count = u30()
0ca96d48 638 u30() # return type
e0df6211 639 for _ in range(param_count):
0ca96d48
PH
640 u30() # param type
641 u30() # name index (always 0 for youtube)
ba552f54 642 flags = read_byte()
e0df6211
PH
643 if flags & 0x08 != 0:
644 # Options present
ba552f54 645 option_count = u30()
e0df6211 646 for c in range(option_count):
0ca96d48
PH
647 u30() # val
648 read_bytes(1) # kind
e0df6211
PH
649 if flags & 0x80 != 0:
650 # Param names present
651 for _ in range(param_count):
0ca96d48 652 u30() # param name
e0df6211
PH
653 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
654 method_infos.append(mi)
655
656 # Metadata
ba552f54 657 metadata_count = u30()
e0df6211 658 for _c in range(metadata_count):
0ca96d48 659 u30() # name
ba552f54 660 item_count = u30()
e0df6211 661 for _c2 in range(item_count):
0ca96d48
PH
662 u30() # key
663 u30() # value
ba552f54
PH
664
665 def parse_traits_info():
666 trait_name_idx = u30()
667 kind_full = read_byte()
e0df6211
PH
668 kind = kind_full & 0x0f
669 attrs = kind_full >> 4
670 methods = {}
671 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
672 u30() # Slot id
673 u30() # type_name_idx
ba552f54 674 vindex = u30()
e0df6211 675 if vindex != 0:
0ca96d48 676 read_byte() # vkind
e0df6211 677 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 678 u30() # disp_id
ba552f54 679 method_idx = u30()
e0df6211
PH
680 methods[multinames[trait_name_idx]] = method_idx
681 elif kind == 0x04: # Class
0ca96d48
PH
682 u30() # slot_id
683 u30() # classi
e0df6211 684 elif kind == 0x05: # Function
0ca96d48 685 u30() # slot_id
ba552f54 686 function_idx = u30()
e0df6211
PH
687 methods[function_idx] = multinames[trait_name_idx]
688 else:
689 raise ExtractorError(u'Unsupported trait kind %d' % kind)
690
691 if attrs & 0x4 != 0: # Metadata present
ba552f54 692 metadata_count = u30()
e0df6211 693 for _c3 in range(metadata_count):
0ca96d48 694 u30() # metadata index
e0df6211 695
ba552f54 696 return methods
e0df6211
PH
697
698 # Classes
699 TARGET_CLASSNAME = u'SignatureDecipher'
700 searched_idx = multinames.index(TARGET_CLASSNAME)
701 searched_class_id = None
ba552f54 702 class_count = u30()
e0df6211 703 for class_id in range(class_count):
ba552f54 704 name_idx = u30()
e0df6211
PH
705 if name_idx == searched_idx:
706 # We found the class we're looking for!
707 searched_class_id = class_id
0ca96d48 708 u30() # super_name idx
ba552f54 709 flags = read_byte()
e0df6211 710 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 711 u30() # protected_ns_idx
ba552f54 712 intrf_count = u30()
e0df6211 713 for _c2 in range(intrf_count):
0ca96d48
PH
714 u30()
715 u30() # iinit
ba552f54 716 trait_count = u30()
e0df6211 717 for _c2 in range(trait_count):
0ca96d48 718 parse_traits_info()
e0df6211
PH
719
720 if searched_class_id is None:
721 raise ExtractorError(u'Target class %r not found' %
722 TARGET_CLASSNAME)
723
724 method_names = {}
725 method_idxs = {}
726 for class_id in range(class_count):
0ca96d48 727 u30() # cinit
ba552f54 728 trait_count = u30()
e0df6211 729 for _c2 in range(trait_count):
ba552f54 730 trait_methods = parse_traits_info()
e0df6211
PH
731 if class_id == searched_class_id:
732 method_names.update(trait_methods.items())
733 method_idxs.update(dict(
734 (idx, name)
735 for name, idx in trait_methods.items()))
736
737 # Scripts
ba552f54 738 script_count = u30()
e0df6211 739 for _c in range(script_count):
0ca96d48 740 u30() # init
ba552f54 741 trait_count = u30()
e0df6211 742 for _c2 in range(trait_count):
0ca96d48 743 parse_traits_info()
e0df6211
PH
744
745 # Method bodies
ba552f54 746 method_body_count = u30()
e0df6211
PH
747 Method = collections.namedtuple('Method', ['code', 'local_count'])
748 methods = {}
749 for _c in range(method_body_count):
ba552f54 750 method_idx = u30()
0ca96d48 751 u30() # max_stack
ba552f54 752 local_count = u30()
0ca96d48
PH
753 u30() # init_scope_depth
754 u30() # max_scope_depth
ba552f54
PH
755 code_length = u30()
756 code = read_bytes(code_length)
e0df6211 757 if method_idx in method_idxs:
ba552f54 758 m = Method(code, local_count)
e0df6211 759 methods[method_idxs[method_idx]] = m
ba552f54 760 exception_count = u30()
e0df6211 761 for _c2 in range(exception_count):
0ca96d48
PH
762 u30() # from
763 u30() # to
764 u30() # target
765 u30() # exc_type
766 u30() # var_name
ba552f54 767 trait_count = u30()
e0df6211 768 for _c2 in range(trait_count):
0ca96d48 769 parse_traits_info()
e0df6211 770
ba552f54 771 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
772 assert len(methods) == len(method_idxs)
773
774 method_pyfunctions = {}
775
776 def extract_function(func_name):
777 if func_name in method_pyfunctions:
778 return method_pyfunctions[func_name]
779 if func_name not in methods:
780 raise ExtractorError(u'Cannot find function %r' % func_name)
781 m = methods[func_name]
782
783 def resfunc(args):
e0df6211
PH
784 registers = ['(this)'] + list(args) + [None] * m.local_count
785 stack = []
786 coder = io.BytesIO(m.code)
787 while True:
788 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 789 if opcode == 36: # pushbyte
e0df6211
PH
790 v = struct.unpack('!B', coder.read(1))[0]
791 stack.append(v)
792 elif opcode == 44: # pushstring
793 idx = u30(coder)
794 stack.append(constant_strings[idx])
795 elif opcode == 48: # pushscope
796 # We don't implement the scope register, so we'll just
797 # ignore the popped value
798 stack.pop()
799 elif opcode == 70: # callproperty
800 index = u30(coder)
801 mname = multinames[index]
802 arg_count = u30(coder)
803 args = list(reversed(
804 [stack.pop() for _ in range(arg_count)]))
805 obj = stack.pop()
806 if mname == u'split':
807 assert len(args) == 1
808 assert isinstance(args[0], compat_str)
809 assert isinstance(obj, compat_str)
810 if args[0] == u'':
811 res = list(obj)
812 else:
813 res = obj.split(args[0])
814 stack.append(res)
a7177865
PH
815 elif mname == u'slice':
816 assert len(args) == 1
817 assert isinstance(args[0], int)
818 assert isinstance(obj, list)
819 res = obj[args[0]:]
820 stack.append(res)
821 elif mname == u'join':
822 assert len(args) == 1
823 assert isinstance(args[0], compat_str)
824 assert isinstance(obj, list)
825 res = args[0].join(obj)
826 stack.append(res)
e0df6211
PH
827 elif mname in method_pyfunctions:
828 stack.append(method_pyfunctions[mname](args))
829 else:
830 raise NotImplementedError(
831 u'Unsupported property %r on %r'
832 % (mname, obj))
a7177865
PH
833 elif opcode == 72: # returnvalue
834 res = stack.pop()
835 return res
836 elif opcode == 79: # callpropvoid
837 index = u30(coder)
838 mname = multinames[index]
839 arg_count = u30(coder)
840 args = list(reversed(
841 [stack.pop() for _ in range(arg_count)]))
842 obj = stack.pop()
843 if mname == u'reverse':
844 assert isinstance(obj, list)
845 obj.reverse()
846 else:
847 raise NotImplementedError(
848 u'Unsupported (void) property %r on %r'
849 % (mname, obj))
e0df6211
PH
850 elif opcode == 93: # findpropstrict
851 index = u30(coder)
852 mname = multinames[index]
853 res = extract_function(mname)
854 stack.append(res)
855 elif opcode == 97: # setproperty
856 index = u30(coder)
857 value = stack.pop()
858 idx = stack.pop()
859 obj = stack.pop()
860 assert isinstance(obj, list)
861 assert isinstance(idx, int)
862 obj[idx] = value
863 elif opcode == 98: # getlocal
864 index = u30(coder)
865 stack.append(registers[index])
866 elif opcode == 99: # setlocal
867 index = u30(coder)
868 value = stack.pop()
869 registers[index] = value
870 elif opcode == 102: # getproperty
871 index = u30(coder)
872 pname = multinames[index]
873 if pname == u'length':
874 obj = stack.pop()
875 assert isinstance(obj, list)
876 stack.append(len(obj))
877 else: # Assume attribute access
878 idx = stack.pop()
879 assert isinstance(idx, int)
880 obj = stack.pop()
881 assert isinstance(obj, list)
882 stack.append(obj[idx])
883 elif opcode == 128: # coerce
0ca96d48 884 u30(coder)
e0df6211
PH
885 elif opcode == 133: # coerce_s
886 assert isinstance(stack[-1], (type(None), compat_str))
887 elif opcode == 164: # modulo
888 value2 = stack.pop()
889 value1 = stack.pop()
890 res = value1 % value2
891 stack.append(res)
a7177865
PH
892 elif opcode == 208: # getlocal_0
893 stack.append(registers[0])
894 elif opcode == 209: # getlocal_1
895 stack.append(registers[1])
896 elif opcode == 210: # getlocal_2
897 stack.append(registers[2])
898 elif opcode == 211: # getlocal_3
899 stack.append(registers[3])
e0df6211
PH
900 elif opcode == 214: # setlocal_2
901 registers[2] = stack.pop()
902 elif opcode == 215: # setlocal_3
903 registers[3] = stack.pop()
904 else:
905 raise NotImplementedError(
906 u'Unsupported opcode %d' % opcode)
907
908 method_pyfunctions[func_name] = resfunc
909 return resfunc
910
911 initial_function = extract_function(u'decipher')
912 return lambda s: initial_function([s])
913
83799698 914 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 915 """Turn the encrypted s field into a working signature"""
6b37f0be 916
83799698 917 if player_url is not None:
9f9be844
PH
918 if player_url.startswith(u'//'):
919 player_url = u'https:' + player_url
e0df6211 920 try:
7f8ae73a
PH
921 player_id = (player_url, len(s))
922 if player_id not in self._player_cache:
83799698 923 func = self._extract_signature_function(
c4417ddb 924 video_id, player_url, len(s)
e0df6211 925 )
7f8ae73a
PH
926 self._player_cache[player_id] = func
927 func = self._player_cache[player_id]
edf3e38e
PH
928 if self._downloader.params.get('youtube_print_sig_code'):
929 self._print_sig_code(func, len(s))
930 return func(s)
0ca96d48 931 except Exception:
e0df6211 932 tb = traceback.format_exc()
83799698
PH
933 self._downloader.report_warning(
934 u'Automatic signature extraction failed: ' + tb)
e0df6211 935
d2d8f895
PH
936 self._downloader.report_warning(
937 u'Warning: Falling back to static signature algorithm')
920de7a2 938
2f2ffea9
PH
939 return self._static_decrypt_signature(
940 s, video_id, player_url, age_gate)
e0df6211 941
2f2ffea9 942 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
943 if age_gate:
944 # The videos with age protection use another player, so the
945 # algorithms can be different.
946 if len(s) == 86:
947 return s[2:63] + s[82] + s[64:82] + s[63]
948
bc4b9008 949 if len(s) == 93:
950 return s[86:29:-1] + s[88] + s[28:5:-1]
951 elif len(s) == 92:
444b1165 952 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
953 elif len(s) == 91:
954 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
955 elif len(s) == 90:
956 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 957 elif len(s) == 89:
958 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 959 elif len(s) == 88:
3e223834 960 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 961 elif len(s) == 87:
3a725669 962 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 963 elif len(s) == 86:
f2c327fd 964 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 965 elif len(s) == 85:
6ae8ee3f 966 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 967 elif len(s) == 84:
6f56389b 968 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 969 elif len(s) == 83:
920de7a2 970 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 971 elif len(s) == 82:
c21315f2 972 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 973 elif len(s) == 81:
aedd6bb9 974 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
975 elif len(s) == 80:
976 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
977 elif len(s) == 79:
978 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
979
980 else:
981 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 982
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to subtitle download URLs.

    The list of tracks is fetched from the timedtext listing service.
    An empty dict is returned (after a warning) when the listing cannot
    be downloaded or contains no tracks.  webpage is not used here.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (track name, language code) pair
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    sub_lang_list = {}
    for track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': track_name.encode('utf-8'),
        })
        sub_lang_list[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1008
055e6f36 1009 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1010 """We need the webpage for getting the captions url, pass it as an
1011 argument to speed up the process."""
ca715127 1012 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1013 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1014 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1015 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1016 if mobj is None:
1017 self._downloader.report_warning(err_msg)
1018 return {}
1019 player_config = json.loads(mobj.group(1))
1020 try:
1021 args = player_config[u'args']
1022 caption_url = args[u'ttsurl']
1023 timestamp = args[u'timestamp']
055e6f36
JMF
1024 # We get the available subtitles
1025 list_params = compat_urllib_parse.urlencode({
1026 'type': 'list',
1027 'tlangs': 1,
1028 'asrs': 1,
de7f3446 1029 })
055e6f36 1030 list_url = caption_url + '&' + list_params
e26f8712 1031 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1032 original_lang_node = caption_list.find('track')
f6a54188 1033 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1034 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1035 return {}
1036 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1037
1038 sub_lang_list = {}
1039 for lang_node in caption_list.findall('target'):
1040 sub_lang = lang_node.attrib['lang_code']
1041 params = compat_urllib_parse.urlencode({
1042 'lang': original_lang,
1043 'tlang': sub_lang,
1044 'fmt': sub_format,
1045 'ts': timestamp,
1046 'kind': 'asr',
1047 })
1048 sub_lang_list[sub_lang] = caption_url + '&' + params
1049 return sub_lang_list
de7f3446
JMF
1050 # An extractor error can be raise by the download process if there are
1051 # no automatic captions but there are subtitles
1052 except (KeyError, ExtractorError):
1053 self._downloader.report_warning(err_msg)
1054 return {}
1055
c5e8d7af
PH
1056 def _extract_id(self, url):
1057 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1058 if mobj is None:
1059 raise ExtractorError(u'Invalid URL: %s' % url)
1060 video_id = mobj.group(2)
1061 return video_id
1062
1d043b93
JMF
1063 def _get_video_url_list(self, url_map):
1064 """
1065 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1066 with the requested formats.
1067 """
2c62dc26 1068 existing_formats = [x for x in self._formats if x in url_map]
1d043b93
JMF
1069 if len(existing_formats) == 0:
1070 raise ExtractorError(u'no known formats available for video')
4ea3be0a 1071 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1072 video_url_list.reverse() # order worst to best
1d043b93
JMF
1073 return video_url_list
1074
1075 def _extract_from_m3u8(self, manifest_url, video_id):
1076 url_map = {}
1077 def _get_urls(_manifest):
1078 lines = _manifest.split('\n')
1079 urls = filter(lambda l: l and not l.startswith('#'),
1080 lines)
1081 return urls
1082 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1083 formats_urls = _get_urls(manifest)
1084 for format_url in formats_urls:
890f62e8 1085 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1086 url_map[itag] = format_url
1087 return url_map
1088
1fb07d10
JG
1089 def _extract_annotations(self, video_id):
1090 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1091 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1092
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and downloadable formats for a single video.

    The watch page and the get_video_info response are both downloaded;
    metadata is taken from whichever of the two provides it.  Returns a
    dict with the keys listed at the end of this method, or None when
    the 'listsubtitles' option is set (the subtitles are only listed in
    that case).  Raises ExtractorError when the video info has no
    'token', for rental videos, when the uploader is missing, for rtmpe
    streams, and when no usable stream information is found.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape characters in the embedded URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the 'el' variants in turn and keep the first response that
        # carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace before parsing
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the content of its title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the <meta name="description"> tag
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Read the number shown in the <span> with the given class,
        # ignoring thousands separators; None if the span is not found
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        # Listing only: no video information is returned
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # When the player configuration embedded in the page contains
    # encrypted signatures, its stream maps are used in video_info.
    # ValueError is raised (and swallowed) when the configuration or its
    # stream map is missing, leaving video_info untouched.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        # Both maps are comma-separated lists of urlencoded stream descriptions
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: can be appended as-is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: has to be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Use the JS player referenced by the page instead
                        # of the SWF player found earlier
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    formats = []
    for itag, video_real_url in video_url_list:
        dct = {
            'format_id': itag,
            'url': video_real_url,
            'player_url': player_url,
        }
        # Add the static properties registered for this itag
        dct.update(self._formats[itag])
        formats.append(dct)

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count':   view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
c5e8d7af 1358
880e1c52 1359class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1360 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1361 _VALID_URL = r"""(?:
1362 (?:https?://)?
1363 (?:\w+\.)?
1364 youtube\.com/
1365 (?:
1366 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1367 \? (?:.*?&)*? (?:p|a|list)=
1368 | p/
1369 )
715c8e7b 1370 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1371 .*
1372 |
715c8e7b 1373 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1374 )"""
dcbb4580
JMF
1375 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1376 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1377 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1378 IE_NAME = u'youtube:playlist'
1379
1380 @classmethod
1381 def suitable(cls, url):
1382 """Receives a URL and returns True if suitable for this IE."""
1383 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1384
880e1c52
JMF
1385 def _real_initialize(self):
1386 self._login()
1387
652cdaa2
JMF
1388 def _ids_to_results(self, ids):
1389 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1390 for vid_id in ids]
1391
1392 def _extract_mix(self, playlist_id):
1393 # The mixes are generated from a a single video
1394 # the id of the playlist is just 'RD' + video_id
7d4afc55 1395 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1396 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1397 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1398 get_element_by_attribute('class', 'title ', webpage))
1399 title = clean_html(title_span)
652cdaa2
JMF
1400 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1401 ids = orderedSet(re.findall(video_re, webpage))
1402 url_results = self._ids_to_results(ids)
1403
1404 return self.playlist_result(url_results, playlist_id, title)
1405
c5e8d7af
PH
1406 def _real_extract(self, url):
1407 # Extract playlist id
1408 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1409 if mobj is None:
1410 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1411 playlist_id = mobj.group(1) or mobj.group(2)
1412
1413 # Check if it's a video-specific URL
7c61bd36 1414 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1415 if 'v' in query_dict:
1416 video_id = query_dict['v'][0]
1417 if self._downloader.params.get('noplaylist'):
1418 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1419 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1420 else:
1421 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1422
7d4afc55 1423 if playlist_id.startswith('RD'):
652cdaa2
JMF
1424 # Mixes require a custom extraction process
1425 return self._extract_mix(playlist_id)
0a688bc0
JMF
1426 if playlist_id.startswith('TL'):
1427 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1428 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1429
dcbb4580
JMF
1430 # Extract the video ids from the playlist pages
1431 ids = []
c5e8d7af 1432
755eb032 1433 for page_num in itertools.count(1):
dcbb4580 1434 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1435 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1436 matches = re.finditer(self._VIDEO_RE, page)
1437 # We remove the duplicates and the link with index 0
1438 # (it's not the first video of the playlist)
1439 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1440 ids.extend(new_ids)
c5e8d7af 1441
dcbb4580 1442 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1443 break
1444
dcbb4580 1445 playlist_title = self._og_search_title(page)
c5e8d7af 1446
652cdaa2 1447 url_results = self._ids_to_results(ids)
dcbb4580 1448 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1449
1450
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract a top list, addressed as 'yttoplist:{channel}:{list title}'."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Locate the list's link on the channel page and return its videos."""
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')

        # The link to the list contains the urlencoded title
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1481
1482
c5e8d7af 1483class YoutubeChannelIE(InfoExtractor):
0f818663 1484 IE_DESC = u'YouTube.com channels'
c5e8d7af 1485 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1486 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1487 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1488 IE_NAME = u'youtube:channel'
1489
1490 def extract_videos_from_page(self, page):
1491 ids_in_page = []
1492 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1493 if mobj.group(1) not in ids_in_page:
1494 ids_in_page.append(mobj.group(1))
1495 return ids_in_page
1496
1497 def _real_extract(self, url):
1498 # Extract channel id
1499 mobj = re.match(self._VALID_URL, url)
1500 if mobj is None:
1501 raise ExtractorError(u'Invalid URL: %s' % url)
1502
1503 # Download channel page
1504 channel_id = mobj.group(1)
1505 video_ids = []
b9643eed
JMF
1506 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1507 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1508 autogenerated = re.search(r'''(?x)
1509 class="[^"]*?(?:
1510 channel-header-autogenerated-label|
1511 yt-channel-title-autogenerated
1512 )[^"]*"''', channel_page) is not None
c5e8d7af 1513
b9643eed
JMF
1514 if autogenerated:
1515 # The videos are contained in a single page
1516 # the ajax pages can't be used, they are empty
1517 video_ids = self.extract_videos_from_page(channel_page)
1518 else:
1519 # Download all channel pages using the json-based channel_ajax query
1520 for pagenum in itertools.count(1):
1521 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1522 page = self._download_webpage(url, channel_id,
1523 u'Downloading page #%s' % pagenum)
1524
1525 page = json.loads(page)
1526
1527 ids_in_page = self.extract_videos_from_page(page['content_html'])
1528 video_ids.extend(ids_in_page)
1529
1530 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1531 break
c5e8d7af
PH
1532
1533 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1534
7012b23c
PH
1535 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1536 for video_id in video_ids]
1537 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1538
1539
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a user through the paged GData feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match it as well.
        other_ies = (klass for (name, klass) in globals().items()
                     if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result with one url entry per uploaded video.

        Raises ExtractorError when url does not match _VALID_URL or when
        the API answers with invalid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        url_results = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The actual id, not the literal string 'video_id'
                    'id': video_id,
                    'title': title,
                })

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(entries) < self._GDATA_PAGE_SIZE:
                break

        return self.playlist_result(url_results, playlist_title=username)
1609
b05654f0
PH
1610
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the paged GData videos feed."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        found_ids = []
        # Lowered to the total number of matches once the API reports it
        limit = n
        pagenum = 0

        while 50 * pagenum < limit:
            start_index = 50 * pagenum + 1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            found_ids.extend([video['id'] for video in api_response['items']])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have brought more results than requested
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in found_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee 1648
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class, with orderby=published appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1654
class YoutubeShowIE(InfoExtractor):
    """Resolve a show page into one playlist url result per season link."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of YoutubePlaylist url results found on the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1668
1669
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their video list from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the one remaining %s is the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk every page of the feed and return a playlist of its videos."""
        collected = []
        paging = 0
        page_number = 1
        while True:
            raw = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw)
            # Video ids are scraped from the watch links in the HTML fragment;
            # orderedSet drops repeated ids while keeping first-seen order.
            found = re.findall(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            for video_id in orderedSet(found):
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # A null paging value marks the last page.
            next_paging = info['paging']
            if next_paging is None:
                break
            paging = next_paging
            page_number += 1
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1712
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    # Space added before the parenthesis so the description reads like the
    # other feed extractors' descriptions.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1718
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    # Matches either the feed page URL or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1724
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed (a personal feed)."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Matches either the feed page URL or the ":ytwatchlater" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1731
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed (a personal feed)."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains "\.", which is not a valid escape in a
    # plain literal; this also matches how the sibling extractors write theirs.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1738
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Hand the logged-in user's favourites over to the playlist extractor."""
    _LOGIN_REQUIRED = True
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a YoutubePlaylist url result for the favourites list id.

        The incoming url is not used; the my_favorites page is always
        fetched and the first "list=" value found in it is passed on.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1749
1750
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs that end right after a feature=... parameter.

    Such URLs carry no video id; extraction always fails with a hint
    about quoting the URL in the shell.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the likely cause."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
            u' (or simply youtube-dl BaW_jenozKc ).'
        )
        raise ExtractorError(hint, expected=True)