]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Add infrastructure for paged lists
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
b7ab0590 30 PagedList,
c91778f8 31 RegexNotFoundError,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
04cc9617 34 orderedSet,
edf3e38e 35 write_json_file,
c5e8d7af
PH
36)
37
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Fetch _LANG_URL (non-fatally) and return True if a page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the credentials returned by _get_login_info().

        Returns True when the login succeeded and False otherwise (no
        credentials, a page could not be downloaded, or the login form was
        served again after submitting).

        Raises ExtractorError if no credentials are available while
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Every other failure path returns False; do the same here
            # instead of falling back to an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were refused
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body must be bytes on Python 3, so encode it the same
        # way _login() encodes its form data.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 128
8377574c 129
de7f3446 130class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 131 IE_DESC = u'YouTube.com'
cb7dfeea 132 _VALID_URL = r"""(?x)^
c5e8d7af 133 (
83aa5293 134 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 135 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2
PH
136 (?:www\.)?deturl\.com/www\.youtube\.com/|
137 (?:www\.)?pwnyoutube\.com|
e69ae5b9
JMF
138 tube\.majestyc\.net/|
139 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
140 (?:.*?\#/)? # handle anchor (#/) redirect urls
141 (?: # the various things that can precede the ID:
142 (?:(?:v|embed|e)/) # v/ or embed/ or e/
143 |(?: # or the v= param in all its forms
d741e55a 144 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
145 (?:\?|\#!?) # the params delimiter ? or # or #!
146 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
147 v=
148 )
f4b05232
JMF
149 ))
150 |youtu\.be/ # just youtu.be/xxxx
151 )
c5e8d7af 152 )? # all until now is optional -> you can pass the naked ID
8963d9c2 153 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
154 (?(1).+)? # if we found the ID, everything can follow
155 $"""
c5e8d7af 156 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
# Static metadata for the known format codes, keyed by the code as a
# string. Each value is a dict of whichever of these fields is known for
# that code: 'ext', 'width', 'height', 'resolution', 'format_note',
# 'vcodec', 'abr', 'preference', 'protocol'.
_formats = {
    '5': {'ext': 'flv', 'width': 400, 'height': 240},
    '6': {'ext': 'flv', 'width': 450, 'height': 270},
    '13': {'ext': '3gp'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
    '34': {'ext': 'flv', 'width': 640, 'height': 360},
    '35': {'ext': 'flv', 'width': 854, 'height': 480},
    '36': {'ext': '3gp', 'width': 320, 'height': 240},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
    '43': {'ext': 'webm', 'width': 640, 'height': 360},
    '44': {'ext': 'webm', 'width': 854, 'height': 480},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


    # 3d videos
    '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
    '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
    '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
    '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
    '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
    '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
    '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

    # Apple HTTP Live Streaming
    '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
    '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
    '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
    '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
    '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
    '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
    '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
    '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
    '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
    '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
    '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
    '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
    '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
    '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

    # Dash webm
    '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
    '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
    '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
    '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
    '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
    '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
    '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

    # Dash webm audio
    '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
    '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},
}
836a086c 224
IE_NAME = u'youtube'
# Test fixtures: each entry names a URL, the expected output file name and the
# expected info_dict values; "note" is a free-text description of the case.
# NOTE(review): descriptions starting with "md5:" look like digests of the
# expected text rather than literal values -- confirm against the test runner.
_TESTS = [
    {
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"info_dict": {
            u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
            u"uploader": u"Philipp Hagemeister",
            u"uploader_id": u"phihag",
            u"upload_date": u"20121002",
            u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        }
    },
    {
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"info_dict": {
            u"upload_date": u"20120506",
            u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
            u"description": u"md5:5b292926389560516e384ac437c0ec07",
            u"uploader": u"Icona Pop",
            u"uploader_id": u"IconaPop"
        }
    },
    {
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"info_dict": {
            u"upload_date": u"20130703",
            u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
            u"description": u"md5:64249768eec3bc4276236606ea996373",
            u"uploader": u"justintimberlakeVEVO",
            u"uploader_id": u"justintimberlakeVEVO"
        }
    },
    {
        # Protocol-relative URL with a mixed-case host name
        u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
        u"file": u"yZIXLfi8CZQ.mp4",
        u"note": u"Embed-only video (#1746)",
        u"info_dict": {
            u"upload_date": u"20120608",
            u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
            u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
            u"uploader": u"SET India",
            u"uploader_id": u"setindia"
        }
    },
]
275
c5e8d7af
PH
276
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs that YoutubePlaylistIE accepts are always rejected here; anything
    else is accepted exactly when it matches _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 282
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initialiser and start with an
    empty cache of signature functions."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    self._player_cache = dict()
e0df6211 286
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
290
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
294
def report_unavailable_format(self, video_id, format):
    """Tell the user that the given format is not available for the video."""
    details = (video_id, format)
    self.to_screen(u'%s: Format %s not available' % details)
298
def report_rtmp_download(self):
    """Tell the user that the download will go over the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
302
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function that deciphers signatures of length *slen*.

    The player type (its file extension) and player id are taken from the
    tail of *player_url*. When a cache directory is configured, a
    previously stored index list for this player/length is loaded and
    turned into a function; otherwise the player is downloaded, parsed by
    _parse_sig_js or _parse_sig_swf, and the resulting permutation is
    written to the cache (a failed write only produces a warning).

    Raises ExtractorError if the player URL cannot be parsed or names an
    unsupported player type.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        # Fail with a readable error instead of an AttributeError on None
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Invariant: the regex above admits no path separators
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except (IOError, ValueError):
            # No cache available, or the cache file does not hold valid
            # JSON; either way fall through and rebuild it below.
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An assert would vanish under -O and leave `res` unbound
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose i-th character has code
            # point i; the code points of the output are then the source
            # index of every output position.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best-effort: report the problem and carry on
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
359
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to *func* for signatures of length *slen*.

    *func* is applied to a probe string whose i-th character has code
    point i, so the code points of the result give the source index of
    each output character. Runs of indices that step by +1 or -1 are
    written as slices of ``s``, all other indices as single subscripts.
    The generated snippet is sent to self.to_screen().
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            # A run that counts down to index 0 has no valid stop value,
            # so the slice is left open-ended.
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # With fewer than two indices the pairing loop below never runs and
        # its loop variable would be unbound afterwards; emit directly.
        if len(idxs) < 2:
            for idx in idxs:
                yield u's[%d]' % idx
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; emit it and start over
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        # Flush whatever is still pending after the last pair
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 395
e0df6211
PH
396 def _parse_sig_js(self, jscode):
397 funcname = self._search_regex(
398 r'signature=([a-zA-Z]+)', jscode,
399 u'Initial JS player signature function name')
400
401 functions = {}
402
403 def argidx(varname):
404 return string.lowercase.index(varname)
405
406 def interpret_statement(stmt, local_vars, allow_recursion=20):
407 if allow_recursion < 0:
0ca96d48 408 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
409
410 if stmt.startswith(u'var '):
411 stmt = stmt[len(u'var '):]
412 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
413 r'=(?P<expr>.*)$', stmt)
414 if ass_m:
415 if ass_m.groupdict().get('index'):
416 def assign(val):
417 lvar = local_vars[ass_m.group('out')]
418 idx = interpret_expression(ass_m.group('index'),
419 local_vars, allow_recursion)
420 assert isinstance(idx, int)
421 lvar[idx] = val
422 return val
423 expr = ass_m.group('expr')
424 else:
425 def assign(val):
426 local_vars[ass_m.group('out')] = val
427 return val
428 expr = ass_m.group('expr')
429 elif stmt.startswith(u'return '):
430 assign = lambda v: v
431 expr = stmt[len(u'return '):]
432 else:
433 raise ExtractorError(
434 u'Cannot determine left side of statement in %r' % stmt)
435
436 v = interpret_expression(expr, local_vars, allow_recursion)
437 return assign(v)
438
439 def interpret_expression(expr, local_vars, allow_recursion):
440 if expr.isdigit():
441 return int(expr)
442
443 if expr.isalpha():
444 return local_vars[expr]
445
446 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
447 if m:
448 member = m.group('member')
449 val = local_vars[m.group('in')]
450 if member == 'split("")':
451 return list(val)
452 if member == 'join("")':
453 return u''.join(val)
454 if member == 'length':
455 return len(val)
456 if member == 'reverse()':
457 return val[::-1]
458 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
459 if slice_m:
460 idx = interpret_expression(
461 slice_m.group('idx'), local_vars, allow_recursion-1)
462 return val[idx:]
463
464 m = re.match(
465 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
466 if m:
467 val = local_vars[m.group('in')]
468 idx = interpret_expression(m.group('idx'), local_vars,
469 allow_recursion-1)
470 return val[idx]
471
472 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
473 if m:
474 a = interpret_expression(m.group('a'),
475 local_vars, allow_recursion)
476 b = interpret_expression(m.group('b'),
477 local_vars, allow_recursion)
478 return a % b
479
480 m = re.match(
481 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
482 if m:
483 fname = m.group('func')
484 if fname not in functions:
485 functions[fname] = extract_function(fname)
486 argvals = [int(v) if v.isdigit() else local_vars[v]
487 for v in m.group('args').split(',')]
488 return functions[fname](argvals)
489 raise ExtractorError(u'Unsupported JS expression %r' % expr)
490
491 def extract_function(funcname):
492 func_m = re.search(
493 r'function ' + re.escape(funcname) +
494 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
495 jscode)
496 argnames = func_m.group('args').split(',')
497
498 def resf(args):
499 local_vars = dict(zip(argnames, args))
500 for stmt in func_m.group('code').split(';'):
501 res = interpret_statement(stmt, local_vars)
502 return res
503 return resf
504
505 initial_function = extract_function(funcname)
506 return lambda s: initial_function([s])
507
def _parse_sig_swf(self, file_contents):
    """Build a Python signature function from a Flash (SWF) player.

    *file_contents* is the raw SWF file as bytes. Only zlib-compressed
    files (first byte ``C``) are supported. The first tag with code 82 is
    parsed as ABC (AVM2 bytecode): the constant pool, method signatures,
    metadata, classes, scripts and method bodies are read in file order,
    and the method bodies of the class named ``SignatureDecipher`` are
    kept. Its ``decipher`` method is then run by a small bytecode
    interpreter that supports only the opcodes handled below.

    Returns a function taking the encrypted signature string.
    Raises ExtractorError for a non-SWF header, a missing target class or
    function, or an unsupported trait kind; NotImplementedError for an
    unsupported compression format, property or opcode.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        # Yield (tag_code, tag_bytes) for every tag in the SWF body
        pos = 0
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Length did not fit in 6 bits; a 32-bit length follows
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the first four bytes and the NUL-terminated string after them
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, least
        # significant group first, high bit set on all but the last byte
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            # Interpret as a negative two's-complement 32-bit value
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # (each pool's stored count is one more than the number of entries
    # that follow, hence the loops starting at 1)
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # A count of 0 also means "no entries". Without the clamp this would
    # ask for -8 bytes, which BytesIO.read() treats as "read to EOF".
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields following each multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        # Consume one trait record and return the methods it declares
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            # NOTE(review): keyed index -> name here, the reverse of the
            # Method branch above -- confirm this is intended.
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    # Second pass over the classes: collect the traits of the target class
    method_names = {}
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole tag must have been consumed, and every method of the
    # target class must have a body
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    # Python functions built so far, by method name
    method_pyfunctions = {}

    def extract_function(func_name):
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 stands for `this`, then the arguments, then locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    # Arguments were pushed first-to-last, so popping
                    # yields them in reverse
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
921
83799698 922 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 923 """Turn the encrypted s field into a working signature"""
6b37f0be 924
83799698 925 if player_url is not None:
9f9be844
PH
926 if player_url.startswith(u'//'):
927 player_url = u'https:' + player_url
e0df6211 928 try:
7f8ae73a
PH
929 player_id = (player_url, len(s))
930 if player_id not in self._player_cache:
83799698 931 func = self._extract_signature_function(
c4417ddb 932 video_id, player_url, len(s)
e0df6211 933 )
7f8ae73a
PH
934 self._player_cache[player_id] = func
935 func = self._player_cache[player_id]
edf3e38e
PH
936 if self._downloader.params.get('youtube_print_sig_code'):
937 self._print_sig_code(func, len(s))
938 return func(s)
0ca96d48 939 except Exception:
e0df6211 940 tb = traceback.format_exc()
83799698
PH
941 self._downloader.report_warning(
942 u'Automatic signature extraction failed: ' + tb)
e0df6211 943
d2d8f895
PH
944 self._downloader.report_warning(
945 u'Warning: Falling back to static signature algorithm')
920de7a2 946
2f2ffea9
PH
947 return self._static_decrypt_signature(
948 s, video_id, player_url, age_gate)
e0df6211 949
2f2ffea9 950 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
951 if age_gate:
952 # The videos with age protection use another player, so the
953 # algorithms can be different.
954 if len(s) == 86:
955 return s[2:63] + s[82] + s[64:82] + s[63]
956
bc4b9008 957 if len(s) == 93:
958 return s[86:29:-1] + s[88] + s[28:5:-1]
959 elif len(s) == 92:
444b1165 960 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
961 elif len(s) == 91:
962 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
963 elif len(s) == 90:
964 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 965 elif len(s) == 89:
966 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 967 elif len(s) == 88:
3e223834 968 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 969 elif len(s) == 87:
3a725669 970 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 971 elif len(s) == 86:
f2c327fd 972 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 973 elif len(s) == 85:
6ae8ee3f 974 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 975 elif len(s) == 84:
6f56389b 976 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 977 elif len(s) == 83:
920de7a2 978 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 979 elif len(s) == 82:
c21315f2 980 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 981 elif len(s) == 81:
aedd6bb9 982 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
983 elif len(s) == 80:
984 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
985 elif len(s) == 79:
986 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
987
988 else:
989 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 990
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to download URLs.

    The track list is fetched from the timedtext listing endpoint;
    ``webpage`` is not used here.  An empty dict is returned (after a
    warning) when the listing cannot be downloaded or names no track.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (track name, language code) pair.
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)

    sub_lang_list = {}
    for track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1016
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target languages to automatic-caption URLs.

    The caption base URL and timestamp are read from the
    ``ytplayer.config`` JSON embedded in ``webpage``, so the page is
    passed in rather than downloaded again.  An empty dict is returned
    (after a warning) when the config or the captions are missing.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config[u'args']
        caption_url = player_args[u'ttsurl']
        timestamp = player_args[u'timestamp']

        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(
            caption_url + '&' + list_query, video_id)

        # Only proceed when the first track is a speech-recognition one.
        source_track = caption_list.find('track')
        has_asr_track = (
            source_track is not None
            and source_track.attrib.get('kind') == 'asr')
        if not has_asr_track:
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = source_track.attrib['lang_code']

        sub_lang_list = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[target_lang] = caption_url + '&' + query
        return sub_lang_list
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1063
c5e8d7af
PH
1064 def _extract_id(self, url):
1065 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1066 if mobj is None:
1067 raise ExtractorError(u'Invalid URL: %s' % url)
1068 video_id = mobj.group(2)
1069 return video_id
1070
1d043b93
JMF
1071 def _get_video_url_list(self, url_map):
1072 """
1073 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1074 with the requested formats.
1075 """
2c62dc26 1076 existing_formats = [x for x in self._formats if x in url_map]
1d043b93
JMF
1077 if len(existing_formats) == 0:
1078 raise ExtractorError(u'no known formats available for video')
4ea3be0a 1079 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1080 video_url_list.reverse() # order worst to best
1d043b93
JMF
1081 return video_url_list
1082
1083 def _extract_from_m3u8(self, manifest_url, video_id):
1084 url_map = {}
1085 def _get_urls(_manifest):
1086 lines = _manifest.split('\n')
1087 urls = filter(lambda l: l and not l.startswith('#'),
1088 lines)
1089 return urls
1090 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1091 formats_urls = _get_urls(manifest)
1092 for format_url in formats_urls:
890f62e8 1093 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1094 url_map[itag] = format_url
1095 return url_map
1096
1fb07d10
JG
1097 def _extract_annotations(self, video_id):
1098 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1099 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1100
c5e8d7af
PH
def _real_extract(self, url):
    """Extract the metadata and format URLs of a single video.

    Downloads the watch page and the ``get_video_info`` data, collects
    uploader, title, thumbnail, upload date, description, like/dislike
    counts, subtitles, duration and (when requested) annotations, then
    builds the list of formats, decrypting signatures where needed.

    Returns the info dictionary, or None when the 'listsubtitles' option
    is set (the available subtitles are only listed in that case).
    Raises ExtractorError when the video info has no token, the video is
    a rental, the uploader is missing, or no usable stream information is
    found.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes of the embedded URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each 'el' variant in turn and stop at the first answer
        # that carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Replace '/', ',' and '-' by spaces and collapse whitespace
        # before handing the text to unified_strdate
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Return the number shown in the span with CSS class `klass`
        # (thousands separators removed), or None when it is absent
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        # Only list the subtitles; nothing is returned for download
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # Prefer the stream maps from the watch page's ytplayer.config when
    # they carry encrypted signatures.  A ValueError (raised below or by
    # json.loads) leaves video_info untouched.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [('_rtmp', video_info['conn'][0])]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        # Each comma-separated entry is a query string describing one format
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: usable as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: must be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Use the JS player URL from the watch page
                        # instead of the SWF one found earlier
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    formats = []
    for itag, video_real_url in video_url_list:
        dct = {
            'format_id': itag,
            'url': video_real_url,
            'player_url': player_url,
        }
        # Add the static properties recorded for this itag
        dct.update(self._formats[itag])
        formats.append(dct)

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count':   view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
c5e8d7af 1366
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including "mix" (RD) playlists."""
    IE_DESC = u'YouTube.com playlists'
    # Verbose pattern: group 1 is the id taken from a playlist-style URL,
    # group 2 a bare playlist id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                       for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix playlist."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for ``url``.

        When the URL also names a video and the 'noplaylist' option is
        set, a url_result for just that video is returned instead.
        Raises ExtractorError for invalid URLs and for top lists (TL ids).
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # playlist_id already carries its own prefix (PL, UU, ...),
                # so it is printed as is.
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        # The title is read from the last downloaded page
        try:
            playlist_title = self._og_search_title(page)
        except RegexNotFoundError:
            self.report_warning(
                u'Playlist page is missing OpenGraph title, falling back ...',
                playlist_id)
            playlist_title = self._html_search_regex(
                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1464
1465
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for channel top lists given as "yttoplist:{channel}:{title}"."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list link on the channel page and return its videos."""
        parsed = re.match(self._VALID_URL, url)
        channel = parsed.group('chann')
        title = parsed.group('title')

        # The list link on the channel page contains the encoded title.
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        attempt = 0
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        while True:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1
        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1496
1497
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, first occurrence order."""
        seen = set()
        ids_in_page = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                ajax_page = json.loads(self._download_webpage(
                    ajax_url, channel_id,
                    u'Downloading page #%s' % pagenum))

                video_ids.extend(
                    self.extract_videos_from_page(ajax_page['content_html']))

                if self._MORE_PAGES_INDICATOR not in ajax_page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1553
1554
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a YouTube user (URL or "ytuser:" keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor in this module takes ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result of the user's uploads."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Yield one url entry per video of the given zero-based page;
            # yields nothing when the feed has no 'entry' key.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The id is the last path component of the entry's id URL
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The extracted id itself, not the literal 'video_id'
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1615
b05654f0
PH
1616
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for "ytsearch" queries, backed by the GData videos feed."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        collected_ids = []
        page_index = 0
        # Shrinks once the API reports fewer total items than requested.
        wanted = n

        while 50 * page_index < wanted:
            start_index = (50 * page_index) + 1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend(
                [video['id'] for video in api_response['items']])

            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have delivered more ids than were asked for.
        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in collected_ids]
        return self.playlist_result(videos, query)
75dff0ee 1654
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds ``orderby=published``."""

    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1660
class YoutubeShowIE(InfoExtractor):
    """Turn a show page into one playlist url result per season."""

    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one for each playlist link found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(
            url, show_name, u'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            u'%s: Found %s seasons' % (show_name, len(season_paths)))

        season_results = []
        for path in season_paths:
            season_results.append(self.url_result(
                'https://www.youtube.com' + path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1674
1675
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed with one ``%s`` slot for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect every video linked from the feed into one playlist result.

        Pages are fetched until the response's ``paging`` value is None.
        """
        entries = []
        paging = 0
        page_number = 0
        while True:
            page_number += 1
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            feed = json.loads(raw_page)

            # De-duplicate ids within the page, keeping first-seen order
            found_ids = re.findall(
                r'"/watch\?v=(.*?)["&]', feed['feed_html'])
            for video_id in orderedSet(found_ids):
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The response supplies the paging value for the next request
            paging = feed['paging']
            if paging is None:
                break
        return self.playlist_result(
            entries, playlist_title=self._PLAYLIST_TITLE)
1718
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed name."""

    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1724
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed name."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1730
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'watch_later' feed name (personal feed)."""

    _FEED_NAME = 'watch_later'
    # Selects action_load_personal_feed in the inherited feed URL template
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1737
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed name (personal feed)."""

    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains ``\.``, which is an invalid escape
    # sequence in a non-raw literal.  The resulting pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Selects action_load_personal_feed in the inherited feed URL template
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1744
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its playlist."""

    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The playlist id is the first ``list=`` value found in the page
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1755
1756
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that end right after ``feature=...`` and explain why."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a shell-quoting hint."""
        quoting_hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .'
        )
        raise ExtractorError(quoting_hint, expected=True)