]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Add a pseudo format for rtmp videos (#2123)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af
PH
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the language-setting URL.

        Returns True if the page could be downloaded, False otherwise
        (the download is non-fatal).
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True on a successful login and False when no credentials
        are available, a page could not be fetched, or the login form is
        shown again (bad username or password).

        Raises ExtractorError if no credentials are available while
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False, like every other failure path of this method
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form. Always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # Request data must be bytes on Python 3; encode it the same way
        # _login encodes its form data.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 126
8377574c 127
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose-mode pattern for the accepted URLs; the second group captures
    # the 11 character video ID. The URL part in front of the ID is optional.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/ # just youtu.be/xxxx
                         )
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
                     $"""
    # Captures the value of a next_url query parameter
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static properties of the known formats, keyed by format id (a string).
    # Lower 'preference' values are presumably ranked lower by the format
    # selection code, which is not part of this section.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    # Test cases: each gives a URL, the expected output file name and the
    # expected metadata. Descriptions starting with "md5:" are presumably
    # compared by hash by the test runner - TODO confirm.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
    ]
271
c5e8d7af
PH
272
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle the given URL.

    URLs accepted by YoutubePlaylistIE are always rejected here; any other
    URL is accepted if it matches _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    url_match = re.match(cls._VALID_URL, url)
    return url_match is not None
c5e8d7af 278
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Create the extractor; all arguments are passed on unchanged to the
    parent classes."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions that were already extracted; _decrypt_signature
    # keys this by (player_url, signature length).
    self._player_cache = {}
e0df6211 282
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
286
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
290
def report_unavailable_format(self, video_id, format):
    """Print a note that the given format is not available for the video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
294
def report_rtmp_download(self):
    """Print a note that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
298
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function deciphering signatures of length slen.

    The function is loaded from the filesystem cache when a usable entry
    exists. Otherwise the player at player_url is downloaded and parsed
    (JavaScript or SWF, chosen by the URL's file extension) and, if the
    cache is enabled, the result is written to the cache.

    video_id is only passed on to the download helpers.

    Raises ExtractorError if the player id and type cannot be read from
    player_url.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        raise ExtractorError(
            u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes a file name, so it must not contain path separators
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # The cache stores, for every output position, the index of
            # the input character that ends up there
            return lambda s: u''.join(s[i] for i in cache_spec)
        except (IOError, ValueError):
            # No cache available, or the cached file is not valid JSON;
            # fall through and extract the function from the player
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    if cache_enabled:
        try:
            # Run the function on a string of distinct characters so the
            # output reveals which input index lands at each position
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best-effort: warn, but still return the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
355
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to func for signatures of length slen.

    func is applied to a string of slen distinct characters; the order of
    the characters in its result is turned into an expression built from
    indexing and slicing operations on s, which is printed via to_screen.
    """
    def gen_sig_code(idxs):
        """Yield index/slice expressions on s that reproduce idxs."""
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # The pairwise loop below needs at least two indices; with fewer
        # its loop variable would be unbound in the final step
        if not idxs:
            return
        if len(idxs) == 1:
            yield u's[%d]' % idxs[0]
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run of consecutive indices ended at prev
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending or descending run starts at prev
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 391
e0df6211
PH
392 def _parse_sig_js(self, jscode):
393 funcname = self._search_regex(
394 r'signature=([a-zA-Z]+)', jscode,
395 u'Initial JS player signature function name')
396
397 functions = {}
398
399 def argidx(varname):
400 return string.lowercase.index(varname)
401
402 def interpret_statement(stmt, local_vars, allow_recursion=20):
403 if allow_recursion < 0:
0ca96d48 404 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
405
406 if stmt.startswith(u'var '):
407 stmt = stmt[len(u'var '):]
408 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
409 r'=(?P<expr>.*)$', stmt)
410 if ass_m:
411 if ass_m.groupdict().get('index'):
412 def assign(val):
413 lvar = local_vars[ass_m.group('out')]
414 idx = interpret_expression(ass_m.group('index'),
415 local_vars, allow_recursion)
416 assert isinstance(idx, int)
417 lvar[idx] = val
418 return val
419 expr = ass_m.group('expr')
420 else:
421 def assign(val):
422 local_vars[ass_m.group('out')] = val
423 return val
424 expr = ass_m.group('expr')
425 elif stmt.startswith(u'return '):
426 assign = lambda v: v
427 expr = stmt[len(u'return '):]
428 else:
429 raise ExtractorError(
430 u'Cannot determine left side of statement in %r' % stmt)
431
432 v = interpret_expression(expr, local_vars, allow_recursion)
433 return assign(v)
434
435 def interpret_expression(expr, local_vars, allow_recursion):
436 if expr.isdigit():
437 return int(expr)
438
439 if expr.isalpha():
440 return local_vars[expr]
441
442 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
443 if m:
444 member = m.group('member')
445 val = local_vars[m.group('in')]
446 if member == 'split("")':
447 return list(val)
448 if member == 'join("")':
449 return u''.join(val)
450 if member == 'length':
451 return len(val)
452 if member == 'reverse()':
453 return val[::-1]
454 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
455 if slice_m:
456 idx = interpret_expression(
457 slice_m.group('idx'), local_vars, allow_recursion-1)
458 return val[idx:]
459
460 m = re.match(
461 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
462 if m:
463 val = local_vars[m.group('in')]
464 idx = interpret_expression(m.group('idx'), local_vars,
465 allow_recursion-1)
466 return val[idx]
467
468 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
469 if m:
470 a = interpret_expression(m.group('a'),
471 local_vars, allow_recursion)
472 b = interpret_expression(m.group('b'),
473 local_vars, allow_recursion)
474 return a % b
475
476 m = re.match(
477 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
478 if m:
479 fname = m.group('func')
480 if fname not in functions:
481 functions[fname] = extract_function(fname)
482 argvals = [int(v) if v.isdigit() else local_vars[v]
483 for v in m.group('args').split(',')]
484 return functions[fname](argvals)
485 raise ExtractorError(u'Unsupported JS expression %r' % expr)
486
487 def extract_function(funcname):
488 func_m = re.search(
489 r'function ' + re.escape(funcname) +
490 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
491 jscode)
492 argnames = func_m.group('args').split(',')
493
494 def resf(args):
495 local_vars = dict(zip(argnames, args))
496 for stmt in func_m.group('code').split(';'):
497 res = interpret_statement(stmt, local_vars)
498 return res
499 return resf
500
501 initial_function = extract_function(funcname)
502 return lambda s: initial_function([s])
503
def _parse_sig_swf(self, file_contents):
    """Build a signature deciphering function from a Flash (SWF) player.

    file_contents holds the bytes of a zlib-compressed SWF file. The tag
    with code 82 is parsed as ABC (AVM2 ByteCode), the methods of the
    class named 'SignatureDecipher' are collected, and its 'decipher'
    method is run by a small interpreter supporting only the opcodes
    handled below.

    Returns a callable taking the signature string and returning the
    result of the 'decipher' method.

    Raises ExtractorError if the data is not an SWF file, the target
    class or a needed function is missing, or a trait kind is
    unsupported. Raises NotImplementedError for other compression
    formats and for unsupported opcodes or properties.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        """Yield (tag_code, tag_data) for every tag in content."""
        pos = 0
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Long tag: the real length follows as a 32 bit integer
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the first 4 bytes and the NUL-terminated string after them
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        """Read a variable-length integer (7 bits per byte, at most 5
        bytes, low bits first)."""
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # The pools below store one entry less than their count says.
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # A count of 0 must not become a negative size: read(-8) would
    # consume the rest of the stream
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 values following each multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        """Read one trait and return the methods it declares."""
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    try:
        searched_idx = multinames.index(TARGET_CLASSNAME)
    except ValueError:
        # The name is not even among the multinames
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    method_names = {}
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                # Reverse mapping: method index -> method name
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            # Only the bodies of the target class' methods are kept
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole tag must have been consumed, and every method of the
    # target class must have a body
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    # Cache of already built Python functions, keyed by method name
    method_pyfunctions = {}

    def extract_function(func_name):
        """Return a Python callable that interprets the bytecode of the
        method func_name; it takes the list of argument values."""
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 stands for "this", followed by the arguments
            # and the method's locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    # Arguments are popped last-first; restore call order
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
917
83799698 918 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 919 """Turn the encrypted s field into a working signature"""
6b37f0be 920
83799698 921 if player_url is not None:
9f9be844
PH
922 if player_url.startswith(u'//'):
923 player_url = u'https:' + player_url
e0df6211 924 try:
7f8ae73a
PH
925 player_id = (player_url, len(s))
926 if player_id not in self._player_cache:
83799698 927 func = self._extract_signature_function(
c4417ddb 928 video_id, player_url, len(s)
e0df6211 929 )
7f8ae73a
PH
930 self._player_cache[player_id] = func
931 func = self._player_cache[player_id]
edf3e38e
PH
932 if self._downloader.params.get('youtube_print_sig_code'):
933 self._print_sig_code(func, len(s))
934 return func(s)
0ca96d48 935 except Exception:
e0df6211 936 tb = traceback.format_exc()
83799698
PH
937 self._downloader.report_warning(
938 u'Automatic signature extraction failed: ' + tb)
e0df6211 939
d2d8f895
PH
940 self._downloader.report_warning(
941 u'Warning: Falling back to static signature algorithm')
920de7a2 942
2f2ffea9
PH
943 return self._static_decrypt_signature(
944 s, video_id, player_url, age_gate)
e0df6211 945
2f2ffea9 946 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
947 if age_gate:
948 # The videos with age protection use another player, so the
949 # algorithms can be different.
950 if len(s) == 86:
951 return s[2:63] + s[82] + s[64:82] + s[63]
952
bc4b9008 953 if len(s) == 93:
954 return s[86:29:-1] + s[88] + s[28:5:-1]
955 elif len(s) == 92:
444b1165 956 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
957 elif len(s) == 91:
958 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
959 elif len(s) == 90:
960 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 961 elif len(s) == 89:
962 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 963 elif len(s) == 88:
3e223834 964 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 965 elif len(s) == 87:
3a725669 966 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 967 elif len(s) == 86:
f2c327fd 968 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 969 elif len(s) == 85:
6ae8ee3f 970 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 971 elif len(s) == 84:
6f56389b 972 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 973 elif len(s) == 83:
920de7a2 974 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 975 elif len(s) == 82:
c21315f2 976 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 977 elif len(s) == 81:
aedd6bb9 978 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
979 elif len(s) == 80:
980 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
981 elif len(s) == 79:
982 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
983
984 else:
985 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 986
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to its timedtext download URL.

    The track list is fetched from video.google.com; webpage is accepted
    but not used here.  A warning is reported and {} returned when the
    list cannot be downloaded or contains no tracks.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (track name, language code) pair
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)

    urls_by_lang = {}
    for track_name, lang_code in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang_code,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        urls_by_lang[lang_code] = u'http://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1012
055e6f36 1013 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1014 """We need the webpage for getting the captions url, pass it as an
1015 argument to speed up the process."""
ca715127 1016 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1017 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1018 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1019 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1020 if mobj is None:
1021 self._downloader.report_warning(err_msg)
1022 return {}
1023 player_config = json.loads(mobj.group(1))
1024 try:
1025 args = player_config[u'args']
1026 caption_url = args[u'ttsurl']
1027 timestamp = args[u'timestamp']
055e6f36
JMF
1028 # We get the available subtitles
1029 list_params = compat_urllib_parse.urlencode({
1030 'type': 'list',
1031 'tlangs': 1,
1032 'asrs': 1,
de7f3446 1033 })
055e6f36 1034 list_url = caption_url + '&' + list_params
e26f8712 1035 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1036 original_lang_node = caption_list.find('track')
f6a54188 1037 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1038 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1039 return {}
1040 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1041
1042 sub_lang_list = {}
1043 for lang_node in caption_list.findall('target'):
1044 sub_lang = lang_node.attrib['lang_code']
1045 params = compat_urllib_parse.urlencode({
1046 'lang': original_lang,
1047 'tlang': sub_lang,
1048 'fmt': sub_format,
1049 'ts': timestamp,
1050 'kind': 'asr',
1051 })
1052 sub_lang_list[sub_lang] = caption_url + '&' + params
1053 return sub_lang_list
de7f3446
JMF
1054 # An extractor error can be raise by the download process if there are
1055 # no automatic captions but there are subtitles
1056 except (KeyError, ExtractorError):
1057 self._downloader.report_warning(err_msg)
1058 return {}
1059
c5e8d7af
PH
1060 def _extract_id(self, url):
1061 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1062 if mobj is None:
1063 raise ExtractorError(u'Invalid URL: %s' % url)
1064 video_id = mobj.group(2)
1065 return video_id
1066
1d043b93
JMF
1067 def _get_video_url_list(self, url_map):
1068 """
1069 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1070 with the requested formats.
1071 """
2c62dc26 1072 existing_formats = [x for x in self._formats if x in url_map]
1d043b93
JMF
1073 if len(existing_formats) == 0:
1074 raise ExtractorError(u'no known formats available for video')
4ea3be0a 1075 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1076 video_url_list.reverse() # order worst to best
1d043b93
JMF
1077 return video_url_list
1078
1079 def _extract_from_m3u8(self, manifest_url, video_id):
1080 url_map = {}
1081 def _get_urls(_manifest):
1082 lines = _manifest.split('\n')
1083 urls = filter(lambda l: l and not l.startswith('#'),
1084 lines)
1085 return urls
1086 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1087 formats_urls = _get_urls(manifest)
1088 for format_url in formats_urls:
890f62e8 1089 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1090 url_map[itag] = format_url
1091 return url_map
1092
1fb07d10
JG
1093 def _extract_annotations(self, video_id):
1094 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1095 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1096
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and format URLs for a single video.

    Downloads the watch page and the get_video_info response, collects
    uploader/title/thumbnail/date/description/counts/subtitles, then
    builds the list of formats from (in order of preference) an rtmp
    'conn' entry, the url-encoded stream maps, or an HLS manifest.

    Returns the info dictionary, or None when the 'listsubtitles'
    parameter is set (the subtitles are listed and extraction stops).
    Raises ExtractorError when required information is missing.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes (e.g. '\/' -> '/') from the URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each 'el' variant in turn and stop at the first response
        # that contains a 'token'
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        # Placeholder title so that extraction can continue
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace
        # before handing the text to unified_strdate
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the <meta name="description"> tag
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Return the number inside <... class="klass">N</span> as an int
        # (thousands separators removed), or None when it is not found
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        # Only listing was requested: stop here, returning None
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # Prefer the stream maps embedded in the watch page's player config
    # when they carry encrypted signatures ('s=' fields).  ValueError is
    # used as the "nothing to do" signal and is deliberately ignored.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            # NOTE(review): the message mentions a "vevo ID" although the
            # lookup is for the player config; it is never shown (caught below)
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            # Append to an existing adaptive_fmts entry, or create one
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        # Single pseudo format keyed '_rtmp'
        video_url_list = [('_rtmp', video_info['conn'][0])]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            # Entries without both an itag and a url are skipped
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: usable as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: must be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        # Diagnostic output only; does not affect the result
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without age gate the JS player URL from the
                        # page's "assets" replaces the SWF player URL
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    formats = []
    for itag, video_real_url in video_url_list:
        dct = {
            'format_id': itag,
            'url': video_real_url,
            'player_url': player_url,
        }
        # Merge in the static per-itag properties
        dct.update(self._formats[itag])
        formats.append(dct)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1362
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlist pages and for 'RD' mixes."""
    IE_DESC = u'YouTube.com playlists'
    # Always matched with re.VERBOSE.  Group 1 is the id found in a URL,
    # group 2 a bare playlist id (which must carry a known prefix).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for url.

        A URL that also names a video ('v' query parameter) yields just
        that video when the 'noplaylist' parameter is set.  Raises
        ExtractorError for non-matching URLs and for 'TL' top-list ids.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # playlist_id already carries its own prefix (PL, RD, UU, ...),
                # so it is printed as is instead of behind a hard-coded 'PL'
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        # The title is taken from the last page that was downloaded
        playlist_title = self._og_search_title(page)

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1453
1454
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list's link on the channel page and return its videos."""
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')

        # The link of the wanted list contains the url-encoded title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1485
1486
class YoutubeChannelIE(InfoExtractor):
    """Extractor for /channel/<id> URLs."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order, without repeats."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in found:
                continue
            found.append(candidate)
        return found

    def _real_extract(self, url):
        """Return a playlist result with every video of the channel."""
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            video_ids = []
            # Download all channel pages using the json-based channel_ajax query
            pagenum = 1
            while True:
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(
                    ajax_url, channel_id,
                    u'Downloading page #%s' % pagenum)
                page_data = json.loads(raw_page)

                video_ids.extend(
                    self.extract_videos_from_page(page_data['content_html']))

                # No "load more" widget means this was the last page
                if self._MORE_PAGES_INDICATOR not in page_data['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = []
        for video_id in video_ids:
            entries.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(entries, channel_id)
c5e8d7af
PH
1542
1543
class YoutubeUserIE(InfoExtractor):
    """Extractor for user pages and the "ytuser:" keyword."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class of this module claims url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result with the uploads of the user in url.

        Raises ExtractorError when url does not match or when the API
        response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        url_results = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The actual id, not the literal string 'video_id'
                    'id': video_id,
                    'title': title,
                })

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(entries) < self._GDATA_PAGE_SIZE:
                break

        return self.playlist_result(url_results, playlist_title=username)
1613
b05654f0
PH
1614
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the gdata videos feed (50 results per page)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        collected_ids = []
        wanted = n
        pagenum = 0

        while (50 * pagenum) < wanted:
            first_index = (50 * pagenum) + 1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), first_index)
            raw_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend([item['id'] for item in api_response['items']])

            # Never ask for more than the API says exist
            wanted = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have brought more ids than requested
        collected_ids = collected_ids[:n]
        entries = []
        for video_id in collected_ids:
            entries.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(entries, query)
75dff0ee 1652
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YoutubeSearchIE variant whose API URL adds ``orderby=published``."""
    # Human-readable identification of this extractor.
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'

    # Search configuration overriding the parent's values.
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1658
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages: one playlist URL result per season link."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of 'YoutubePlaylist' URL results found on the show page.

        The show name is whatever follows ``/show/`` in the URL; it is only
        used to label the download and the status message.
        """
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(
            url, show_name, u'Downloading show webpage')

        # There's one playlist link for each season of the show.
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            u'%s: Found %s seasons' % (show_name, len(season_paths)))

        season_results = []
        for path in season_paths:
            season_results.append(self.url_result(
                'https://www.youtube.com' + path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1672
1673
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common machinery for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed.
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; its single ``%s`` is the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name, derived from the feed name."""
        feed_name = self._FEED_NAME
        return u'youtube:%s' % feed_name

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Walk every page of the feed and return its videos as a playlist.

        Each page is a JSON document: ``feed_html`` is scanned for watch
        links, and ``paging`` holds the value to request the next page with
        (``None`` on the last page).
        """
        entries = []
        page_token = 0
        page_number = 1
        while True:
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % page_token,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            feed = json.loads(raw_page)

            # Collect the ids of this page, dropping repeats but keeping order.
            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed['feed_html']))
            for video_id in page_ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            page_token = feed['paging']
            if page_token is None:
                break
            page_number += 1

        return self.playlist_result(
            entries, playlist_title=self._PLAYLIST_TITLE)
1716
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed.

    Matches the feed/subscriptions URL as well as the ``:ytsubs`` and
    ``:ytsubscriptions`` keywords.
    """
    # Description wording aligned with the sibling feed extractors: a space
    # separates "keyword" from the parenthesised note.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1722
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed.

    Matches the feed/recommended URL as well as the ``:ytrec`` and
    ``:ytrecommended`` keywords.
    """
    # Feed selection and the title given to the resulting playlist.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'

    # URL matching and listing description.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1728
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed, loaded as a personal feed.

    Matches the feed/watch_later URL as well as the ``:ytwatchlater`` keyword.
    """
    # Feed selection and the title given to the resulting playlist.
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'

    # URL matching and listing description.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1735
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed, loaded as a personal feed.

    Matches the feed/history URL as well as the ``:ythistory`` keyword.
    """
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling extractors: the pattern contains ``\.``,
    # which is a regex escape and not a valid string-literal escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1742
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a URL result handing the favourites playlist to 'YoutubePlaylist'.

        The incoming ``url`` is not used; the my_favorites page is always
        fetched, and the first ``list=`` value found in it is taken as the
        playlist id.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1753
1754
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that end right after ``feature=...`` and explain why.

    Such a URL has no video id, which typically happens when an unquoted
    ``&`` was interpreted by the shell; extraction always fails with a hint.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        """Always raise an expected ExtractorError telling the user to quote the URL."""
        quoting_hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .'
        )
        raise ExtractorError(quoting_hint, expected=True)