]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Merge pull request #2153 from jaimeMF/ffmpeg-merger-check-install
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af
PH
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Subclasses get language selection, (optional) account login and age
    confirmation performed once from ``_real_initialize``.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the language-setting URL; return True if a page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote=u'unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True on success and False when no credentials are
        available or any step of the login fails.  Raises ExtractorError
        when credentials are missing and ``_LOGIN_REQUIRED`` is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False, like every other failure path of this method
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data must be bytes on Python 3; encode exactly like _login does
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age; stop at the first failed step."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 126
8377574c 127
de7f3446 128class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 129 IE_DESC = u'YouTube.com'
# Verbose regex matching every supported way of referring to a single video.
# Group 1 (optional) is everything preceding the ID, group 2 is the
# 11-character video ID itself.
_VALID_URL = r"""(?x)^
                 (
                     (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        (?:www\.)?deturl\.com/www\.youtube\.com/|
                        (?:www\.)?pwnyoutube\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     ))
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )
                 )?                                                       # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                 (?(1).+)?                                                # if we found the ID, everything can follow
                 $"""
c5e8d7af 154 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
155 _formats = {
156 '5': {'ext': 'flv', 'width': 400, 'height': 240},
157 '6': {'ext': 'flv', 'width': 450, 'height': 270},
158 '13': {'ext': '3gp'},
159 '17': {'ext': '3gp', 'width': 176, 'height': 144},
160 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
161 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
162 '34': {'ext': 'flv', 'width': 640, 'height': 360},
163 '35': {'ext': 'flv', 'width': 854, 'height': 480},
164 '36': {'ext': '3gp', 'width': 320, 'height': 240},
165 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
166 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
167 '43': {'ext': 'webm', 'width': 640, 'height': 360},
168 '44': {'ext': 'webm', 'width': 854, 'height': 480},
169 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
170 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
171
1d043b93 172
86fe61c8 173 # 3d videos
2c62dc26
PH
174 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
175 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
176 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
177 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
178 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
179 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
180 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 181
96fb5605 182 # Apple HTTP Live Streaming
2c62dc26
PH
183 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
184 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
185 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
186 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
187 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
188 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
189 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
190
191 # DASH mp4 video
192 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
193 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
194 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
195 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
196 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
197 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
198 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
8fa8a629 199 '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
836a086c 200
f6f1fc92 201 # Dash mp4 audio
2c62dc26
PH
202 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
203 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
204 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
205
206 # Dash webm
2c62dc26
PH
207 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
208 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
209 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
210 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
211 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
212 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
213 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
214
215 # Dash webm audio
216 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
217 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
218
219 # RTMP (unnamed)
220 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 221 }
836a086c 222
c5e8d7af 223 IE_NAME = u'youtube'
2eb88d95
PH
224 _TESTS = [
225 {
0e853ca4
PH
226 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
227 u"file": u"BaW_jenozKc.mp4",
228 u"info_dict": {
229 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
230 u"uploader": u"Philipp Hagemeister",
231 u"uploader_id": u"phihag",
232 u"upload_date": u"20121002",
27dcce19 233 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 234 }
0e853ca4 235 },
0e853ca4
PH
236 {
237 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
238 u"file": u"UxxajLWwzqY.mp4",
239 u"note": u"Test generic use_cipher_signature video (#897)",
240 u"info_dict": {
241 u"upload_date": u"20120506",
242 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 243 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 244 u"uploader": u"Icona Pop",
0e853ca4 245 u"uploader_id": u"IconaPop"
2eb88d95 246 }
c108eb73
JMF
247 },
248 {
249 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
250 u"file": u"07FYdnEawAQ.mp4",
251 u"note": u"Test VEVO video with age protection (#956)",
252 u"info_dict": {
253 u"upload_date": u"20130703",
254 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
255 u"description": u"md5:64249768eec3bc4276236606ea996373",
256 u"uploader": u"justintimberlakeVEVO",
257 u"uploader_id": u"justintimberlakeVEVO"
258 }
259 },
fccd3771 260 {
83aa5293 261 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
262 u"file": u"yZIXLfi8CZQ.mp4",
263 u"note": u"Embed-only video (#1746)",
264 u"info_dict": {
265 u"upload_date": u"20120608",
266 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
267 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
268 u"uploader": u"SET India",
269 u"uploader_id": u"setindia"
270 }
271 },
2eb88d95
PH
272 ]
273
c5e8d7af
PH
274
@classmethod
def suitable(cls, url):
    """Return True if this IE should handle *url*.

    URLs accepted by YoutubePlaylistIE are never claimed here, even when
    they also match _VALID_URL.
    """
    return (not YoutubePlaylistIE.suitable(url)
            and re.match(cls._VALID_URL, url) is not None)
c5e8d7af 280
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent classes and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature length)
    self._player_cache = dict()
e0df6211 284
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
288
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
292
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available for the video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
296
def report_rtmp_download(self):
    """Tell the user that the download will go through the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
300
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function that transforms a signature of length *slen*.

    The player type (js or swf) and player id are taken from *player_url*.
    If a cache directory is configured, a previously stored index list
    for this player/length is used instead of downloading the player, and
    a freshly extracted function is written back to the cache.

    Raises ExtractorError if the player URL cannot be parsed or names an
    unsupported player type.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes a file name; it must not contain path separators
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            pass  # Unreadable/corrupt cache entry: extract again below

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Feed a string of distinct characters through the function;
            # the code points of the output are the source indices.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best-effort: warn, but still return the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
357
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to *func* for signatures of length *slen*.

    *func* is applied to a string of *slen* distinct characters; the
    resulting index list is rendered as a sum of slices and single
    indexes of ``s`` and written with to_screen().
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # The pairwise loop below needs at least two indices
        if not idxs:
            yield u"u''"
            return
        if len(idxs) == 1:
            yield u's[%d]' % idxs[0]
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue  # still inside the current run
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending or descending run begins at prev
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 393
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a Python function from the signature function in *jscode*.

    The name of the initial function is taken from the first
    ``signature=NAME`` occurrence.  Function bodies are interpreted by a
    tiny interpreter supporting assignments, ``return``, ``split("")``,
    ``join("")``, ``length``, ``reverse()``, ``slice(n)``, indexing,
    ``%`` and calls to other functions found in *jscode*.

    Returns a callable taking the signature string.  Raises
    ExtractorError on unsupported statements/expressions or when a
    referenced function cannot be found.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    # Functions are extracted lazily and memoized by name
    functions = {}

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')

        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m:
            if ass_m.groupdict().get('index'):
                # Assignment to an element: out[index] = expr
                def assign(val):
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    lvar[idx] = val
                    return val
                expr = ass_m.group('expr')
            else:
                # Plain assignment: out = expr
                def assign(val):
                    local_vars[ass_m.group('out')] = val
                    return val
                expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            assign = lambda v: v
            expr = stmt[len(u'return '):]
        else:
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)

        v = interpret_expression(expr, local_vars, allow_recursion)
        return assign(v)

    def interpret_expression(expr, local_vars, allow_recursion):
        if expr.isdigit():
            return int(expr)

        if expr.isalpha():
            return local_vars[expr]

        # Member access / method call on a variable
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        if m:
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
                return list(val)
            if member == 'join("")':
                return u''.join(val)
            if member == 'length':
                return len(val)
            if member == 'reverse()':
                return val[::-1]
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
            if slice_m:
                idx = interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion-1)
                return val[idx:]

        # Indexing: var[expr]
        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        if m:
            val = local_vars[m.group('in')]
            idx = interpret_expression(m.group('idx'), local_vars,
                                       allow_recursion-1)
            return val[idx]

        # Binary modulo: a%b
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        if m:
            a = interpret_expression(m.group('a'),
                                     local_vars, allow_recursion)
            b = interpret_expression(m.group('b'),
                                     local_vars, allow_recursion)
            return a % b

        # Call of another function defined in jscode
        m = re.match(
            r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        if m:
            fname = m.group('func')
            if fname not in functions:
                functions[fname] = extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        func_m = re.search(
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            jscode)
        if func_m is None:
            raise ExtractorError(
                u'Cannot find JS function %r' % funcname)
        argnames = func_m.group('args').split(',')

        def resf(args):
            local_vars = dict(zip(argnames, args))
            # The value of the last statement is the function's result
            for stmt in func_m.group('code').split(';'):
                res = interpret_statement(stmt, local_vars)
            return res
        return resf

    initial_function = extract_function(funcname)
    return lambda s: initial_function([s])
505
def _parse_sig_swf(self, file_contents):
    """Build a Python function from the ``decipher`` method of an SWF player.

    *file_contents* is the raw (zlib-compressed, ``CWS``) SWF file.  The
    ABC (AVM2 bytecode) block in the first tag with code 82 is parsed,
    the class ``SignatureDecipher`` is located and its methods are run by
    a small interpreter that supports only the opcodes listed below.

    Returns a callable taking the signature string.  Raises
    ExtractorError for non-SWF input, a missing target class/function or
    an unsupported trait kind, and NotImplementedError for unsupported
    compression, properties or opcodes.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        pos = 0
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Long tag: the real length follows as a 32 bit integer
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the first 4 bytes and the NUL-terminated string that follows
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, at most 5 bytes
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # Each pool stores count-1 entries (the loops below start at 1), so a
    # count of 0 and a count of 1 both mean "no entries".
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # Clamp: with double_count == 0 a negative size would be requested
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields following the kind, per multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        # Consume one trait and return the methods it declares
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    if TARGET_CLASSNAME not in multinames:
        # Same error as below instead of a bare ValueError from index()
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    method_names = {}
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            # Keep only the bodies of the target class' methods, by name
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole tag must have been consumed
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 is "this", followed by the arguments and locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
919
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    When a player URL is known, the deciphering function extracted from
    that player is used (cached per player URL and signature length).  If
    that fails for any reason, or no player URL is available, the static
    per-length algorithm is used instead.
    """
    if player_url is None:
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

    # Protocol-relative player URLs are fetched over https
    if player_url.startswith(u'//'):
        player_url = u'https:' + player_url

    try:
        cache_key = (player_url, len(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, len(s)
            )
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, len(s))
        return decipher(s)
    except Exception:
        # Best effort: report the failure and fall through to the static
        # algorithm below
        self._downloader.report_warning(
            u'Automatic signature extraction failed: ' + traceback.format_exc())

    self._downloader.report_warning(
        u'Warning: Falling back to static signature algorithm')
    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
e0df6211 947
2f2ffea9 948 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
949 if age_gate:
950 # The videos with age protection use another player, so the
951 # algorithms can be different.
952 if len(s) == 86:
953 return s[2:63] + s[82] + s[64:82] + s[63]
954
bc4b9008 955 if len(s) == 93:
956 return s[86:29:-1] + s[88] + s[28:5:-1]
957 elif len(s) == 92:
444b1165 958 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
959 elif len(s) == 91:
960 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
961 elif len(s) == 90:
962 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 963 elif len(s) == 89:
964 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 965 elif len(s) == 88:
3e223834 966 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 967 elif len(s) == 87:
3a725669 968 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 969 elif len(s) == 86:
f2c327fd 970 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 971 elif len(s) == 85:
6ae8ee3f 972 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 973 elif len(s) == 84:
6f56389b 974 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 975 elif len(s) == 83:
920de7a2 976 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 977 elif len(s) == 82:
c21315f2 978 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 979 elif len(s) == 81:
aedd6bb9 980 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
981 elif len(s) == 80:
982 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
983 elif len(s) == 79:
984 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
985
986 else:
987 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 988
def _get_available_subtitles(self, video_id, webpage):
    """Return a dictionary mapping language codes to subtitle URLs.

    The track list is downloaded from the timedtext service; webpage is
    not used here.  An empty dictionary is returned (after a warning) if
    the list cannot be downloaded or contains no tracks.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    sub_lang_list = {}
    for track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        # A later track with the same language code replaces an earlier one
        sub_lang_list[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if not sub_lang_list:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
        return {}
    return sub_lang_list
1014
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dictionary mapping language codes to automatic caption URLs.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.  An empty dictionary is returned
    (after a warning) when no automatic captions can be found.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config[u'args']
        caption_url = player_args[u'ttsurl']
        timestamp = player_args[u'timestamp']

        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)

        # Only the first track is inspected; it has to be an 'asr' track
        track = caption_list.find('track')
        if track is None or track.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = track.attrib['lang_code']

        sub_lang_list = {}
        for target in caption_list.findall('target'):
            sub_lang = target.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + query
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1061
c5e8d7af
PH
1062 def _extract_id(self, url):
1063 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1064 if mobj is None:
1065 raise ExtractorError(u'Invalid URL: %s' % url)
1066 video_id = mobj.group(2)
1067 return video_id
1068
1d043b93
JMF
1069 def _get_video_url_list(self, url_map):
1070 """
1071 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1072 with the requested formats.
1073 """
2c62dc26 1074 existing_formats = [x for x in self._formats if x in url_map]
1d043b93
JMF
1075 if len(existing_formats) == 0:
1076 raise ExtractorError(u'no known formats available for video')
4ea3be0a 1077 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1078 video_url_list.reverse() # order worst to best
1d043b93
JMF
1079 return video_url_list
1080
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download the manifest and return a dictionary {itag: url}.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is taken from the 'itag/<digits>/' part.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#' lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
1094
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for video_id."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1098
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and the available formats for a single video.

    Downloads the watch page and the get_video_info data, scrapes the
    uploader, title, thumbnail, upload date, description and like/dislike
    counts, resolves (and if needed decrypts the signature of) the stream
    URLs and returns the info dictionary.  Returns None right after
    listing the subtitles when the 'listsubtitles' option is set.

    Raises ExtractorError when the video info carries no token, for
    rental videos, when the uploader name is missing, for rtmpe streams
    and when no stream information is found.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    next_url_match = re.search(self._NEXT_URL_RE, url)
    if next_url_match:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(next_url_match.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    watch_url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(watch_url, video_id)

    # Attempt to extract SWF player URL
    player_url = None
    swf_match = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if swf_match is not None:
        player_url = re.sub(r'\\(.)', r'\1', swf_match.group(1))

    # Get video info
    self.report_video_info_webpage_download(video_id)
    age_gate = re.search(r'player-age-gate-content">', video_webpage) is not None
    if age_gate:
        self.report_age_confirmation()
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_webpage = self._download_webpage(
            'https://www.youtube.com/get_video_info?' + data, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        # Try the el variants in turn until one response carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break

    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    view_count = int(video_info['view_count'][0]) if 'view_count' in video_info else None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    uploader_match = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if uploader_match is None:
        video_uploader_id = None
        self._downloader.report_warning(u'unable to extract uploader nickname')
    else:
        video_uploader_id = uploader_match.group(1)

    # title
    if 'title' not in video_info:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'
    else:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    thumb_match = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
    if thumb_match is not None:
        video_thumbnail = thumb_match.group(1)
    elif 'thumbnail_url' in video_info:
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
    else:
        # don't panic if we can't find it
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None

    # upload date
    upload_date = None
    date_match = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if date_match is not None:
        # Turn the separators into single spaces before parsing the date
        date_text = ' '.join(re.sub(r'[/,-]', r' ', date_match.group(1)).split())
        upload_date = unified_strdate(date_text)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace the redirect links by the content of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        meta_match = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(meta_match.group(1)) if meta_match else u''

    def _extract_count(klass):
        # Number inside <span class="klass">, thousands separators removed
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is None:
            return None
        return int(count.replace(',', ''))
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' in video_info:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
    else:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # Prefer the stream maps embedded in the page's player config when
    # they carry encrypted signatures.  A ValueError (no config, invalid
    # JSON, no stream map) just means video_info is used unchanged.
    try:
        config_match = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not config_match:
            raise ValueError('Could not find vevo ID')
        args = json.loads(config_match.group(1))['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        if re_signature.search(args['url_encoded_fmt_stream_map']) is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        if re_signature.search(args.get('adaptive_fmts', u'')) is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [('_rtmp', video_info['conn'][0])]
    elif (len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1
            or len(video_info.get('adaptive_fmts', [])) >= 1):
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_url = url_data['url'][0]
            if 'sig' in url_data:
                format_url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                if self._downloader.params.get('verbose'):
                    if age_gate:
                        if player_url is None:
                            player_version = 'unknown'
                        else:
                            player_version = self._search_regex(
                                r'-(.+)\.swf$', player_url,
                                u'flash player', fatal=False)
                        player_desc = 'flash player %s' % player_version
                    else:
                        player_version = self._search_regex(
                            r'html5player-(.+?)\.js', video_webpage,
                            'html5 player', fatal=False)
                        player_desc = u'html5 player %s' % player_version

                    parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                    self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                   (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                if not age_gate:
                    # Outside the age gate the JS player named in the
                    # page's assets replaces any SWF player URL
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        video_webpage, u'JS player URL')
                    player_url = json.loads(jsplayer_url_json)

                format_url += '&signature=' + self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
            if 'ratebypass' not in format_url:
                format_url += '&ratebypass=yes'
            url_map[url_data['itag'][0]] = format_url
        video_url_list = self._get_video_url_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    formats = []
    for itag, video_real_url in video_url_list:
        fmt = {
            'format_id': itag,
            'url': video_real_url,
            'player_url': player_url,
        }
        # Entries of the per-itag table override the generic keys above
        fmt.update(self._formats[itag])
        formats.append(fmt)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1364
880e1c52 1365class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1366 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1367 _VALID_URL = r"""(?:
1368 (?:https?://)?
1369 (?:\w+\.)?
1370 youtube\.com/
1371 (?:
1372 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1373 \? (?:.*?&)*? (?:p|a|list)=
1374 | p/
1375 )
715c8e7b 1376 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1377 .*
1378 |
715c8e7b 1379 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1380 )"""
dcbb4580
JMF
1381 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1382 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1383 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1384 IE_NAME = u'youtube:playlist'
1385
1386 @classmethod
1387 def suitable(cls, url):
1388 """Receives a URL and returns True if suitable for this IE."""
1389 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1390
880e1c52
JMF
1391 def _real_initialize(self):
1392 self._login()
1393
652cdaa2
JMF
1394 def _ids_to_results(self, ids):
1395 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1396 for vid_id in ids]
1397
1398 def _extract_mix(self, playlist_id):
1399 # The mixes are generated from a a single video
1400 # the id of the playlist is just 'RD' + video_id
7d4afc55 1401 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1402 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1403 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1404 get_element_by_attribute('class', 'title ', webpage))
1405 title = clean_html(title_span)
652cdaa2
JMF
1406 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1407 ids = orderedSet(re.findall(video_re, webpage))
1408 url_results = self._ids_to_results(ids)
1409
1410 return self.playlist_result(url_results, playlist_id, title)
1411
c5e8d7af
PH
1412 def _real_extract(self, url):
1413 # Extract playlist id
1414 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1415 if mobj is None:
1416 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1417 playlist_id = mobj.group(1) or mobj.group(2)
1418
1419 # Check if it's a video-specific URL
7c61bd36 1420 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1421 if 'v' in query_dict:
1422 video_id = query_dict['v'][0]
1423 if self._downloader.params.get('noplaylist'):
1424 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1425 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1426 else:
1427 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1428
7d4afc55 1429 if playlist_id.startswith('RD'):
652cdaa2
JMF
1430 # Mixes require a custom extraction process
1431 return self._extract_mix(playlist_id)
0a688bc0
JMF
1432 if playlist_id.startswith('TL'):
1433 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1434 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1435
dcbb4580
JMF
1436 # Extract the video ids from the playlist pages
1437 ids = []
c5e8d7af 1438
755eb032 1439 for page_num in itertools.count(1):
dcbb4580 1440 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1441 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1442 matches = re.finditer(self._VIDEO_RE, page)
1443 # We remove the duplicates and the link with index 0
1444 # (it's not the first video of the playlist)
1445 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1446 ids.extend(new_ids)
c5e8d7af 1447
dcbb4580 1448 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1449 break
1450
dcbb4580 1451 playlist_title = self._og_search_title(page)
c5e8d7af 1452
652cdaa2 1453 url_results = self._ids_to_results(ids)
dcbb4580 1454 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1455
1456
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" keyword."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list named in url on the channel page and return its videos."""
        match = re.match(self._VALID_URL, url)
        channel, title = match.group('chann'), match.group('title')

        # The link to the list contains the urlencoded title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(list_url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1487
1488
c5e8d7af 1489class YoutubeChannelIE(InfoExtractor):
0f818663 1490 IE_DESC = u'YouTube.com channels'
c5e8d7af 1491 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1492 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1493 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1494 IE_NAME = u'youtube:channel'
1495
1496 def extract_videos_from_page(self, page):
1497 ids_in_page = []
1498 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1499 if mobj.group(1) not in ids_in_page:
1500 ids_in_page.append(mobj.group(1))
1501 return ids_in_page
1502
1503 def _real_extract(self, url):
1504 # Extract channel id
1505 mobj = re.match(self._VALID_URL, url)
1506 if mobj is None:
1507 raise ExtractorError(u'Invalid URL: %s' % url)
1508
1509 # Download channel page
1510 channel_id = mobj.group(1)
1511 video_ids = []
b9643eed
JMF
1512 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1513 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1514 autogenerated = re.search(r'''(?x)
1515 class="[^"]*?(?:
1516 channel-header-autogenerated-label|
1517 yt-channel-title-autogenerated
1518 )[^"]*"''', channel_page) is not None
c5e8d7af 1519
b9643eed
JMF
1520 if autogenerated:
1521 # The videos are contained in a single page
1522 # the ajax pages can't be used, they are empty
1523 video_ids = self.extract_videos_from_page(channel_page)
1524 else:
1525 # Download all channel pages using the json-based channel_ajax query
1526 for pagenum in itertools.count(1):
1527 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1528 page = self._download_webpage(url, channel_id,
1529 u'Downloading page #%s' % pagenum)
1530
1531 page = json.loads(page)
1532
1533 ids_in_page = self.extract_videos_from_page(page['content_html'])
1534 video_ids.extend(ids_in_page)
1535
1536 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1537 break
c5e8d7af
PH
1538
1539 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1540
7012b23c
PH
1541 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1542 for video_id in video_ids]
1543 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1544
1545
class YoutubeUserIE(InfoExtractor):
    """Extractor for all the uploads of a YouTube user."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor; the regex is too permissive and would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result with the uploads of the user in url.

        Raises ExtractorError for invalid URLs and when the API response
        is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        url_results = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The actual id, not the literal string 'video_id'
                    'id': video_id,
                    'title': title,
                })

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(entries) < self._GDATA_PAGE_SIZE:
                break

        return self.playlist_result(url_results, playlist_title=username)
1615
b05654f0
PH
1616
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for the "ytsearch" keyword, backed by the gdata API."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        # The API URL asks for 50 results per page
        while 50 * pagenum < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more than the API reports as available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have returned more ids than requested
        video_ids = video_ids[:n]
        videos = []
        for video_id in video_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1654
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YouTube search variant whose API query adds ``orderby=published``."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint and parameters as the parent class, plus the ordering.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1660
class YoutubeShowIE(InfoExtractor):
    """Turn a YouTube show page into url results for its playlists."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list with one 'YoutubePlaylist' url result per playlist link."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [
            self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
            for path in season_paths
        ]
04cc9617
JMF
1674
1675
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page; the remaining %s takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the videos of every feed page into a single playlist result."""
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_page)
            # Keep the first occurrence of each video id, in page order.
            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', info['feed_html']))
            feed_entries.extend(
                [self.url_result(video_id, 'Youtube', video_id=video_id)
                 for video_id in video_ids])
            # A None paging value marks the last page.
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1718
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    # The description now has a space before "(requires authentication)".
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1724
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1730
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    # Overrides the base-class default of False.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1737
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains "\." which is not a valid escape in a
    # normal string literal. The resulting pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Overrides the base-class default of False.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1744
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to the playlist it links to."""
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = u'youtube:favorites'

    def _real_extract(self, url):
        """Return a 'YoutubePlaylist' url result for the list id found on the page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The first "list=" parameter on the page is taken as the playlist id.
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1755
1756
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that end right after ``feature=...`` and explain why.

    Extraction never succeeds: it always raises an expected ExtractorError
    telling the user to quote the URL.
    """
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'

    def _real_extract(self, url):
        """Always raise an ExtractorError carrying a quoting hint."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)