]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Fall back to header if playlist title is not available
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
c91778f8 30 RegexNotFoundError,
c5e8d7af
PH
31 unescapeHTML,
32 unified_strdate,
04cc9617 33 orderedSet,
edf3e38e 34 write_json_file,
c5e8d7af
PH
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors

    On initialization the extractor tries, in order, to set the site
    language, log in with the configured credentials and confirm the age
    gate.  Each step is skipped when the previous one did not succeed.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the language-setting URL; return True if a page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True on success and False in every other case (no
        credentials, login page unavailable, login request failed or
        credentials rejected).  Raises ExtractorError when no credentials
        are available but _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False, consistent with the other failure paths
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data has to be bytes on Python 3; encode like _login does
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Run language setup, login and age confirmation, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 127
8377574c 128
de7f3446 129class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 130 IE_DESC = u'YouTube.com'
cb7dfeea 131 _VALID_URL = r"""(?x)^
c5e8d7af 132 (
83aa5293 133 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 134 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2
PH
135 (?:www\.)?deturl\.com/www\.youtube\.com/|
136 (?:www\.)?pwnyoutube\.com|
e69ae5b9
JMF
137 tube\.majestyc\.net/|
138 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
139 (?:.*?\#/)? # handle anchor (#/) redirect urls
140 (?: # the various things that can precede the ID:
141 (?:(?:v|embed|e)/) # v/ or embed/ or e/
142 |(?: # or the v= param in all its forms
d741e55a 143 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
144 (?:\?|\#!?) # the params delimiter ? or # or #!
145 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
146 v=
147 )
f4b05232
JMF
148 ))
149 |youtu\.be/ # just youtu.be/xxxx
150 )
c5e8d7af 151 )? # all until now is optional -> you can pass the naked ID
8963d9c2 152 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
153 (?(1).+)? # if we found the ID, everything can follow
154 $"""
c5e8d7af 155 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
156 _formats = {
157 '5': {'ext': 'flv', 'width': 400, 'height': 240},
158 '6': {'ext': 'flv', 'width': 450, 'height': 270},
159 '13': {'ext': '3gp'},
160 '17': {'ext': '3gp', 'width': 176, 'height': 144},
161 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
162 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
163 '34': {'ext': 'flv', 'width': 640, 'height': 360},
164 '35': {'ext': 'flv', 'width': 854, 'height': 480},
165 '36': {'ext': '3gp', 'width': 320, 'height': 240},
166 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
167 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
168 '43': {'ext': 'webm', 'width': 640, 'height': 360},
169 '44': {'ext': 'webm', 'width': 854, 'height': 480},
170 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
171 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
172
1d043b93 173
86fe61c8 174 # 3d videos
2c62dc26
PH
175 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
176 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
177 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
178 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
179 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
180 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
181 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 182
96fb5605 183 # Apple HTTP Live Streaming
2c62dc26
PH
184 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
185 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
186 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
187 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
188 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
189 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
190 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
191
192 # DASH mp4 video
193 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
194 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
195 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
196 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
197 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
198 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
199 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
8fa8a629 200 '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
836a086c 201
f6f1fc92 202 # Dash mp4 audio
2c62dc26
PH
203 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
204 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
205 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
206
207 # Dash webm
2c62dc26
PH
208 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
209 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
210 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
211 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
212 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
213 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
214 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
215
216 # Dash webm audio
217 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
218 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
219
220 # RTMP (unnamed)
221 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 222 }
836a086c 223
c5e8d7af 224 IE_NAME = u'youtube'
2eb88d95
PH
225 _TESTS = [
226 {
0e853ca4
PH
227 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
228 u"file": u"BaW_jenozKc.mp4",
229 u"info_dict": {
230 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
231 u"uploader": u"Philipp Hagemeister",
232 u"uploader_id": u"phihag",
233 u"upload_date": u"20121002",
27dcce19 234 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 235 }
0e853ca4 236 },
0e853ca4
PH
237 {
238 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
239 u"file": u"UxxajLWwzqY.mp4",
240 u"note": u"Test generic use_cipher_signature video (#897)",
241 u"info_dict": {
242 u"upload_date": u"20120506",
243 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 244 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 245 u"uploader": u"Icona Pop",
0e853ca4 246 u"uploader_id": u"IconaPop"
2eb88d95 247 }
c108eb73
JMF
248 },
249 {
250 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
251 u"file": u"07FYdnEawAQ.mp4",
252 u"note": u"Test VEVO video with age protection (#956)",
253 u"info_dict": {
254 u"upload_date": u"20130703",
255 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
256 u"description": u"md5:64249768eec3bc4276236606ea996373",
257 u"uploader": u"justintimberlakeVEVO",
258 u"uploader_id": u"justintimberlakeVEVO"
259 }
260 },
fccd3771 261 {
83aa5293 262 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
263 u"file": u"yZIXLfi8CZQ.mp4",
264 u"note": u"Embed-only video (#1746)",
265 u"info_dict": {
266 u"upload_date": u"20120608",
267 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
268 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
269 u"uploader": u"SET India",
270 u"uploader_id": u"setindia"
271 }
272 },
2eb88d95
PH
273 ]
274
c5e8d7af
PH
275
276 @classmethod
277 def suitable(cls, url):
278 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 279 if YoutubePlaylistIE.suitable(url): return False
fccd3771 280 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 281
e0df6211
PH
282 def __init__(self, *args, **kwargs):
283 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 284 self._player_cache = {}
e0df6211 285
c5e8d7af
PH
286 def report_video_info_webpage_download(self, video_id):
287 """Report attempt to download video info webpage."""
288 self.to_screen(u'%s: Downloading video info webpage' % video_id)
289
c5e8d7af
PH
290 def report_information_extraction(self, video_id):
291 """Report attempt to extract video information."""
292 self.to_screen(u'%s: Extracting video information' % video_id)
293
294 def report_unavailable_format(self, video_id, format):
295 """Report extracted video URL."""
296 self.to_screen(u'%s: Format %s not available' % (video_id, format))
297
298 def report_rtmp_download(self):
299 """Indicate the download will use the RTMP protocol."""
300 self.to_screen(u'RTMP download detected')
301
c4417ddb
PH
302 def _extract_signature_function(self, video_id, player_url, slen):
303 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 304 player_url)
e0df6211
PH
305 player_type = id_m.group('ext')
306 player_id = id_m.group('id')
307
c4417ddb
PH
308 # Read from filesystem cache
309 func_id = '%s_%s_%d' % (player_type, player_id, slen)
310 assert os.path.basename(func_id) == func_id
c38b1e77 311 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 312
c3c88a26 313 cache_enabled = cache_dir is not None
f8061589 314 if cache_enabled:
c4417ddb
PH
315 cache_fn = os.path.join(os.path.expanduser(cache_dir),
316 u'youtube-sigfuncs',
317 func_id + '.json')
318 try:
edf3e38e 319 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
320 cache_spec = json.load(cachef)
321 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 322 except IOError:
c4417ddb 323 pass # No cache available
83799698 324
e0df6211
PH
325 if player_type == 'js':
326 code = self._download_webpage(
327 player_url, video_id,
83799698 328 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 329 errnote=u'Download of %s failed' % player_url)
83799698 330 res = self._parse_sig_js(code)
c4417ddb 331 elif player_type == 'swf':
e0df6211
PH
332 urlh = self._request_webpage(
333 player_url, video_id,
83799698 334 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
335 errnote=u'Download of %s failed' % player_url)
336 code = urlh.read()
83799698 337 res = self._parse_sig_swf(code)
e0df6211
PH
338 else:
339 assert False, 'Invalid player type %r' % player_type
340
f8061589 341 if cache_enabled:
edf3e38e 342 try:
c705320f
PH
343 test_string = u''.join(map(compat_chr, range(slen)))
344 cache_res = res(test_string)
edf3e38e
PH
345 cache_spec = [ord(c) for c in cache_res]
346 try:
347 os.makedirs(os.path.dirname(cache_fn))
348 except OSError as ose:
349 if ose.errno != errno.EEXIST:
350 raise
351 write_json_file(cache_spec, cache_fn)
0ca96d48 352 except Exception:
edf3e38e
PH
353 tb = traceback.format_exc()
354 self._downloader.report_warning(
355 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
356
357 return res
358
edf3e38e
PH
359 def _print_sig_code(self, func, slen):
360 def gen_sig_code(idxs):
361 def _genslice(start, end, step):
362 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
363 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
364 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
365 return u's[%s%s%s]' % (starts, ends, steps)
366
367 step = None
0ca96d48
PH
368 start = '(Never used)' # Quelch pyflakes warnings - start will be
369 # set as soon as step is set
edf3e38e
PH
370 for i, prev in zip(idxs[1:], idxs[:-1]):
371 if step is not None:
372 if i - prev == step:
373 continue
374 yield _genslice(start, prev, step)
375 step = None
376 continue
377 if i - prev in [-1, 1]:
378 step = i - prev
379 start = prev
380 continue
381 else:
382 yield u's[%d]' % prev
383 if step is None:
384 yield u's[%d]' % i
385 else:
386 yield _genslice(start, i, step)
387
c705320f
PH
388 test_string = u''.join(map(compat_chr, range(slen)))
389 cache_res = func(test_string)
edf3e38e
PH
390 cache_spec = [ord(c) for c in cache_res]
391 expr_code = u' + '.join(gen_sig_code(cache_spec))
392 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 393 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 394
e0df6211
PH
395 def _parse_sig_js(self, jscode):
396 funcname = self._search_regex(
397 r'signature=([a-zA-Z]+)', jscode,
398 u'Initial JS player signature function name')
399
400 functions = {}
401
402 def argidx(varname):
403 return string.lowercase.index(varname)
404
405 def interpret_statement(stmt, local_vars, allow_recursion=20):
406 if allow_recursion < 0:
0ca96d48 407 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
408
409 if stmt.startswith(u'var '):
410 stmt = stmt[len(u'var '):]
411 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
412 r'=(?P<expr>.*)$', stmt)
413 if ass_m:
414 if ass_m.groupdict().get('index'):
415 def assign(val):
416 lvar = local_vars[ass_m.group('out')]
417 idx = interpret_expression(ass_m.group('index'),
418 local_vars, allow_recursion)
419 assert isinstance(idx, int)
420 lvar[idx] = val
421 return val
422 expr = ass_m.group('expr')
423 else:
424 def assign(val):
425 local_vars[ass_m.group('out')] = val
426 return val
427 expr = ass_m.group('expr')
428 elif stmt.startswith(u'return '):
429 assign = lambda v: v
430 expr = stmt[len(u'return '):]
431 else:
432 raise ExtractorError(
433 u'Cannot determine left side of statement in %r' % stmt)
434
435 v = interpret_expression(expr, local_vars, allow_recursion)
436 return assign(v)
437
438 def interpret_expression(expr, local_vars, allow_recursion):
439 if expr.isdigit():
440 return int(expr)
441
442 if expr.isalpha():
443 return local_vars[expr]
444
445 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
446 if m:
447 member = m.group('member')
448 val = local_vars[m.group('in')]
449 if member == 'split("")':
450 return list(val)
451 if member == 'join("")':
452 return u''.join(val)
453 if member == 'length':
454 return len(val)
455 if member == 'reverse()':
456 return val[::-1]
457 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
458 if slice_m:
459 idx = interpret_expression(
460 slice_m.group('idx'), local_vars, allow_recursion-1)
461 return val[idx:]
462
463 m = re.match(
464 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
465 if m:
466 val = local_vars[m.group('in')]
467 idx = interpret_expression(m.group('idx'), local_vars,
468 allow_recursion-1)
469 return val[idx]
470
471 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
472 if m:
473 a = interpret_expression(m.group('a'),
474 local_vars, allow_recursion)
475 b = interpret_expression(m.group('b'),
476 local_vars, allow_recursion)
477 return a % b
478
479 m = re.match(
480 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
481 if m:
482 fname = m.group('func')
483 if fname not in functions:
484 functions[fname] = extract_function(fname)
485 argvals = [int(v) if v.isdigit() else local_vars[v]
486 for v in m.group('args').split(',')]
487 return functions[fname](argvals)
488 raise ExtractorError(u'Unsupported JS expression %r' % expr)
489
490 def extract_function(funcname):
491 func_m = re.search(
492 r'function ' + re.escape(funcname) +
493 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
494 jscode)
495 argnames = func_m.group('args').split(',')
496
497 def resf(args):
498 local_vars = dict(zip(argnames, args))
499 for stmt in func_m.group('code').split(';'):
500 res = interpret_statement(stmt, local_vars)
501 return res
502 return resf
503
504 initial_function = extract_function(funcname)
505 return lambda s: initial_function([s])
506
507 def _parse_sig_swf(self, file_contents):
508 if file_contents[1:3] != b'WS':
509 raise ExtractorError(
510 u'Not an SWF file; header is %r' % file_contents[:3])
511 if file_contents[:1] == b'C':
512 content = zlib.decompress(file_contents[8:])
513 else:
514 raise NotImplementedError(u'Unsupported compression format %r' %
515 file_contents[:1])
516
517 def extract_tags(content):
518 pos = 0
519 while pos < len(content):
520 header16 = struct.unpack('<H', content[pos:pos+2])[0]
521 pos += 2
522 tag_code = header16 >> 6
523 tag_len = header16 & 0x3f
524 if tag_len == 0x3f:
525 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
526 pos += 4
527 assert pos+tag_len <= len(content)
528 yield (tag_code, content[pos:pos+tag_len])
529 pos += tag_len
530
531 code_tag = next(tag
532 for tag_code, tag in extract_tags(content)
533 if tag_code == 82)
534 p = code_tag.index(b'\0', 4) + 1
ba552f54 535 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
536
537 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
538 def read_int(reader=None):
539 if reader is None:
540 reader = code_reader
e0df6211
PH
541 res = 0
542 shift = 0
543 for _ in range(5):
ba552f54
PH
544 buf = reader.read(1)
545 assert len(buf) == 1
546 b = struct.unpack('<B', buf)[0]
e0df6211
PH
547 res = res | ((b & 0x7f) << shift)
548 if b & 0x80 == 0:
549 break
550 shift += 7
ba552f54
PH
551 return res
552
553 def u30(reader=None):
554 res = read_int(reader)
555 assert res & 0xf0000000 == 0
e0df6211
PH
556 return res
557 u32 = read_int
558
ba552f54
PH
559 def s32(reader=None):
560 v = read_int(reader)
e0df6211
PH
561 if v & 0x80000000 != 0:
562 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
563 return v
564
0ca96d48 565 def read_string(reader=None):
ba552f54
PH
566 if reader is None:
567 reader = code_reader
568 slen = u30(reader)
569 resb = reader.read(slen)
570 assert len(resb) == slen
571 return resb.decode('utf-8')
572
573 def read_bytes(count, reader=None):
574 if reader is None:
575 reader = code_reader
576 resb = reader.read(count)
577 assert len(resb) == count
578 return resb
579
580 def read_byte(reader=None):
581 resb = read_bytes(1, reader=reader)
582 res = struct.unpack('<B', resb)[0]
583 return res
e0df6211
PH
584
585 # minor_version + major_version
0ca96d48 586 read_bytes(2 + 2)
e0df6211
PH
587
588 # Constant pool
ba552f54 589 int_count = u30()
e0df6211 590 for _c in range(1, int_count):
0ca96d48 591 s32()
ba552f54 592 uint_count = u30()
e0df6211 593 for _c in range(1, uint_count):
0ca96d48 594 u32()
ba552f54 595 double_count = u30()
0ca96d48 596 read_bytes((double_count-1) * 8)
ba552f54 597 string_count = u30()
e0df6211
PH
598 constant_strings = [u'']
599 for _c in range(1, string_count):
0ca96d48 600 s = read_string()
e0df6211 601 constant_strings.append(s)
ba552f54 602 namespace_count = u30()
e0df6211 603 for _c in range(1, namespace_count):
0ca96d48
PH
604 read_bytes(1) # kind
605 u30() # name
ba552f54 606 ns_set_count = u30()
e0df6211 607 for _c in range(1, ns_set_count):
ba552f54 608 count = u30()
e0df6211 609 for _c2 in range(count):
0ca96d48 610 u30()
ba552f54 611 multiname_count = u30()
e0df6211
PH
612 MULTINAME_SIZES = {
613 0x07: 2, # QName
614 0x0d: 2, # QNameA
615 0x0f: 1, # RTQName
616 0x10: 1, # RTQNameA
617 0x11: 0, # RTQNameL
618 0x12: 0, # RTQNameLA
619 0x09: 2, # Multiname
620 0x0e: 2, # MultinameA
621 0x1b: 1, # MultinameL
622 0x1c: 1, # MultinameLA
623 }
624 multinames = [u'']
625 for _c in range(1, multiname_count):
ba552f54 626 kind = u30()
e0df6211
PH
627 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
628 if kind == 0x07:
0ca96d48 629 u30() # namespace_idx
ba552f54 630 name_idx = u30()
e0df6211
PH
631 multinames.append(constant_strings[name_idx])
632 else:
633 multinames.append('[MULTINAME kind: %d]' % kind)
634 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 635 u30()
e0df6211
PH
636
637 # Methods
ba552f54 638 method_count = u30()
e0df6211
PH
639 MethodInfo = collections.namedtuple(
640 'MethodInfo',
641 ['NEED_ARGUMENTS', 'NEED_REST'])
642 method_infos = []
643 for method_id in range(method_count):
ba552f54 644 param_count = u30()
0ca96d48 645 u30() # return type
e0df6211 646 for _ in range(param_count):
0ca96d48
PH
647 u30() # param type
648 u30() # name index (always 0 for youtube)
ba552f54 649 flags = read_byte()
e0df6211
PH
650 if flags & 0x08 != 0:
651 # Options present
ba552f54 652 option_count = u30()
e0df6211 653 for c in range(option_count):
0ca96d48
PH
654 u30() # val
655 read_bytes(1) # kind
e0df6211
PH
656 if flags & 0x80 != 0:
657 # Param names present
658 for _ in range(param_count):
0ca96d48 659 u30() # param name
e0df6211
PH
660 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
661 method_infos.append(mi)
662
663 # Metadata
ba552f54 664 metadata_count = u30()
e0df6211 665 for _c in range(metadata_count):
0ca96d48 666 u30() # name
ba552f54 667 item_count = u30()
e0df6211 668 for _c2 in range(item_count):
0ca96d48
PH
669 u30() # key
670 u30() # value
ba552f54
PH
671
672 def parse_traits_info():
673 trait_name_idx = u30()
674 kind_full = read_byte()
e0df6211
PH
675 kind = kind_full & 0x0f
676 attrs = kind_full >> 4
677 methods = {}
678 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
679 u30() # Slot id
680 u30() # type_name_idx
ba552f54 681 vindex = u30()
e0df6211 682 if vindex != 0:
0ca96d48 683 read_byte() # vkind
e0df6211 684 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 685 u30() # disp_id
ba552f54 686 method_idx = u30()
e0df6211
PH
687 methods[multinames[trait_name_idx]] = method_idx
688 elif kind == 0x04: # Class
0ca96d48
PH
689 u30() # slot_id
690 u30() # classi
e0df6211 691 elif kind == 0x05: # Function
0ca96d48 692 u30() # slot_id
ba552f54 693 function_idx = u30()
e0df6211
PH
694 methods[function_idx] = multinames[trait_name_idx]
695 else:
696 raise ExtractorError(u'Unsupported trait kind %d' % kind)
697
698 if attrs & 0x4 != 0: # Metadata present
ba552f54 699 metadata_count = u30()
e0df6211 700 for _c3 in range(metadata_count):
0ca96d48 701 u30() # metadata index
e0df6211 702
ba552f54 703 return methods
e0df6211
PH
704
705 # Classes
706 TARGET_CLASSNAME = u'SignatureDecipher'
707 searched_idx = multinames.index(TARGET_CLASSNAME)
708 searched_class_id = None
ba552f54 709 class_count = u30()
e0df6211 710 for class_id in range(class_count):
ba552f54 711 name_idx = u30()
e0df6211
PH
712 if name_idx == searched_idx:
713 # We found the class we're looking for!
714 searched_class_id = class_id
0ca96d48 715 u30() # super_name idx
ba552f54 716 flags = read_byte()
e0df6211 717 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 718 u30() # protected_ns_idx
ba552f54 719 intrf_count = u30()
e0df6211 720 for _c2 in range(intrf_count):
0ca96d48
PH
721 u30()
722 u30() # iinit
ba552f54 723 trait_count = u30()
e0df6211 724 for _c2 in range(trait_count):
0ca96d48 725 parse_traits_info()
e0df6211
PH
726
727 if searched_class_id is None:
728 raise ExtractorError(u'Target class %r not found' %
729 TARGET_CLASSNAME)
730
731 method_names = {}
732 method_idxs = {}
733 for class_id in range(class_count):
0ca96d48 734 u30() # cinit
ba552f54 735 trait_count = u30()
e0df6211 736 for _c2 in range(trait_count):
ba552f54 737 trait_methods = parse_traits_info()
e0df6211
PH
738 if class_id == searched_class_id:
739 method_names.update(trait_methods.items())
740 method_idxs.update(dict(
741 (idx, name)
742 for name, idx in trait_methods.items()))
743
744 # Scripts
ba552f54 745 script_count = u30()
e0df6211 746 for _c in range(script_count):
0ca96d48 747 u30() # init
ba552f54 748 trait_count = u30()
e0df6211 749 for _c2 in range(trait_count):
0ca96d48 750 parse_traits_info()
e0df6211
PH
751
752 # Method bodies
ba552f54 753 method_body_count = u30()
e0df6211
PH
754 Method = collections.namedtuple('Method', ['code', 'local_count'])
755 methods = {}
756 for _c in range(method_body_count):
ba552f54 757 method_idx = u30()
0ca96d48 758 u30() # max_stack
ba552f54 759 local_count = u30()
0ca96d48
PH
760 u30() # init_scope_depth
761 u30() # max_scope_depth
ba552f54
PH
762 code_length = u30()
763 code = read_bytes(code_length)
e0df6211 764 if method_idx in method_idxs:
ba552f54 765 m = Method(code, local_count)
e0df6211 766 methods[method_idxs[method_idx]] = m
ba552f54 767 exception_count = u30()
e0df6211 768 for _c2 in range(exception_count):
0ca96d48
PH
769 u30() # from
770 u30() # to
771 u30() # target
772 u30() # exc_type
773 u30() # var_name
ba552f54 774 trait_count = u30()
e0df6211 775 for _c2 in range(trait_count):
0ca96d48 776 parse_traits_info()
e0df6211 777
ba552f54 778 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
779 assert len(methods) == len(method_idxs)
780
781 method_pyfunctions = {}
782
783 def extract_function(func_name):
784 if func_name in method_pyfunctions:
785 return method_pyfunctions[func_name]
786 if func_name not in methods:
787 raise ExtractorError(u'Cannot find function %r' % func_name)
788 m = methods[func_name]
789
790 def resfunc(args):
e0df6211
PH
791 registers = ['(this)'] + list(args) + [None] * m.local_count
792 stack = []
793 coder = io.BytesIO(m.code)
794 while True:
795 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 796 if opcode == 36: # pushbyte
e0df6211
PH
797 v = struct.unpack('!B', coder.read(1))[0]
798 stack.append(v)
799 elif opcode == 44: # pushstring
800 idx = u30(coder)
801 stack.append(constant_strings[idx])
802 elif opcode == 48: # pushscope
803 # We don't implement the scope register, so we'll just
804 # ignore the popped value
805 stack.pop()
806 elif opcode == 70: # callproperty
807 index = u30(coder)
808 mname = multinames[index]
809 arg_count = u30(coder)
810 args = list(reversed(
811 [stack.pop() for _ in range(arg_count)]))
812 obj = stack.pop()
813 if mname == u'split':
814 assert len(args) == 1
815 assert isinstance(args[0], compat_str)
816 assert isinstance(obj, compat_str)
817 if args[0] == u'':
818 res = list(obj)
819 else:
820 res = obj.split(args[0])
821 stack.append(res)
a7177865
PH
822 elif mname == u'slice':
823 assert len(args) == 1
824 assert isinstance(args[0], int)
825 assert isinstance(obj, list)
826 res = obj[args[0]:]
827 stack.append(res)
828 elif mname == u'join':
829 assert len(args) == 1
830 assert isinstance(args[0], compat_str)
831 assert isinstance(obj, list)
832 res = args[0].join(obj)
833 stack.append(res)
e0df6211
PH
834 elif mname in method_pyfunctions:
835 stack.append(method_pyfunctions[mname](args))
836 else:
837 raise NotImplementedError(
838 u'Unsupported property %r on %r'
839 % (mname, obj))
a7177865
PH
840 elif opcode == 72: # returnvalue
841 res = stack.pop()
842 return res
843 elif opcode == 79: # callpropvoid
844 index = u30(coder)
845 mname = multinames[index]
846 arg_count = u30(coder)
847 args = list(reversed(
848 [stack.pop() for _ in range(arg_count)]))
849 obj = stack.pop()
850 if mname == u'reverse':
851 assert isinstance(obj, list)
852 obj.reverse()
853 else:
854 raise NotImplementedError(
855 u'Unsupported (void) property %r on %r'
856 % (mname, obj))
e0df6211
PH
857 elif opcode == 93: # findpropstrict
858 index = u30(coder)
859 mname = multinames[index]
860 res = extract_function(mname)
861 stack.append(res)
862 elif opcode == 97: # setproperty
863 index = u30(coder)
864 value = stack.pop()
865 idx = stack.pop()
866 obj = stack.pop()
867 assert isinstance(obj, list)
868 assert isinstance(idx, int)
869 obj[idx] = value
870 elif opcode == 98: # getlocal
871 index = u30(coder)
872 stack.append(registers[index])
873 elif opcode == 99: # setlocal
874 index = u30(coder)
875 value = stack.pop()
876 registers[index] = value
877 elif opcode == 102: # getproperty
878 index = u30(coder)
879 pname = multinames[index]
880 if pname == u'length':
881 obj = stack.pop()
882 assert isinstance(obj, list)
883 stack.append(len(obj))
884 else: # Assume attribute access
885 idx = stack.pop()
886 assert isinstance(idx, int)
887 obj = stack.pop()
888 assert isinstance(obj, list)
889 stack.append(obj[idx])
890 elif opcode == 128: # coerce
0ca96d48 891 u30(coder)
e0df6211
PH
892 elif opcode == 133: # coerce_s
893 assert isinstance(stack[-1], (type(None), compat_str))
894 elif opcode == 164: # modulo
895 value2 = stack.pop()
896 value1 = stack.pop()
897 res = value1 % value2
898 stack.append(res)
a7177865
PH
899 elif opcode == 208: # getlocal_0
900 stack.append(registers[0])
901 elif opcode == 209: # getlocal_1
902 stack.append(registers[1])
903 elif opcode == 210: # getlocal_2
904 stack.append(registers[2])
905 elif opcode == 211: # getlocal_3
906 stack.append(registers[3])
e0df6211
PH
907 elif opcode == 214: # setlocal_2
908 registers[2] = stack.pop()
909 elif opcode == 215: # setlocal_3
910 registers[3] = stack.pop()
911 else:
912 raise NotImplementedError(
913 u'Unsupported opcode %d' % opcode)
914
915 method_pyfunctions[func_name] = resfunc
916 return resfunc
917
918 initial_function = extract_function(u'decipher')
919 return lambda s: initial_function([s])
920
83799698 921 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 922 """Turn the encrypted s field into a working signature"""
6b37f0be 923
83799698 924 if player_url is not None:
9f9be844
PH
925 if player_url.startswith(u'//'):
926 player_url = u'https:' + player_url
e0df6211 927 try:
7f8ae73a
PH
928 player_id = (player_url, len(s))
929 if player_id not in self._player_cache:
83799698 930 func = self._extract_signature_function(
c4417ddb 931 video_id, player_url, len(s)
e0df6211 932 )
7f8ae73a
PH
933 self._player_cache[player_id] = func
934 func = self._player_cache[player_id]
edf3e38e
PH
935 if self._downloader.params.get('youtube_print_sig_code'):
936 self._print_sig_code(func, len(s))
937 return func(s)
0ca96d48 938 except Exception:
e0df6211 939 tb = traceback.format_exc()
83799698
PH
940 self._downloader.report_warning(
941 u'Automatic signature extraction failed: ' + tb)
e0df6211 942
d2d8f895
PH
943 self._downloader.report_warning(
944 u'Warning: Falling back to static signature algorithm')
920de7a2 945
2f2ffea9
PH
946 return self._static_decrypt_signature(
947 s, video_id, player_url, age_gate)
e0df6211 948
2f2ffea9 949 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
950 if age_gate:
951 # The videos with age protection use another player, so the
952 # algorithms can be different.
953 if len(s) == 86:
954 return s[2:63] + s[82] + s[64:82] + s[63]
955
bc4b9008 956 if len(s) == 93:
957 return s[86:29:-1] + s[88] + s[28:5:-1]
958 elif len(s) == 92:
444b1165 959 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
960 elif len(s) == 91:
961 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
962 elif len(s) == 90:
963 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 964 elif len(s) == 89:
965 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 966 elif len(s) == 88:
3e223834 967 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 968 elif len(s) == 87:
3a725669 969 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 970 elif len(s) == 86:
f2c327fd 971 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 972 elif len(s) == 85:
6ae8ee3f 973 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 974 elif len(s) == 84:
6f56389b 975 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 976 elif len(s) == 83:
920de7a2 977 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 978 elif len(s) == 82:
c21315f2 979 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 980 elif len(s) == 81:
aedd6bb9 981 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
982 elif len(s) == 80:
983 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
984 elif len(s) == 79:
985 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
986
987 else:
988 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 989
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle download URL.

    The track list is fetched from the timedtext listing endpoint; the
    ``webpage`` argument is not used.  An empty dict is returned (after
    a warning) when the list cannot be downloaded or contains no tracks.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (track name, language code) pair.
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    sub_lang_list = {}
    for track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1015
055e6f36 1016 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1017 """We need the webpage for getting the captions url, pass it as an
1018 argument to speed up the process."""
ca715127 1019 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1020 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1021 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1022 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1023 if mobj is None:
1024 self._downloader.report_warning(err_msg)
1025 return {}
1026 player_config = json.loads(mobj.group(1))
1027 try:
1028 args = player_config[u'args']
1029 caption_url = args[u'ttsurl']
1030 timestamp = args[u'timestamp']
055e6f36
JMF
1031 # We get the available subtitles
1032 list_params = compat_urllib_parse.urlencode({
1033 'type': 'list',
1034 'tlangs': 1,
1035 'asrs': 1,
de7f3446 1036 })
055e6f36 1037 list_url = caption_url + '&' + list_params
e26f8712 1038 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1039 original_lang_node = caption_list.find('track')
f6a54188 1040 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1041 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1042 return {}
1043 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1044
1045 sub_lang_list = {}
1046 for lang_node in caption_list.findall('target'):
1047 sub_lang = lang_node.attrib['lang_code']
1048 params = compat_urllib_parse.urlencode({
1049 'lang': original_lang,
1050 'tlang': sub_lang,
1051 'fmt': sub_format,
1052 'ts': timestamp,
1053 'kind': 'asr',
1054 })
1055 sub_lang_list[sub_lang] = caption_url + '&' + params
1056 return sub_lang_list
de7f3446
JMF
1057 # An extractor error can be raise by the download process if there are
1058 # no automatic captions but there are subtitles
1059 except (KeyError, ExtractorError):
1060 self._downloader.report_warning(err_msg)
1061 return {}
1062
c5e8d7af
PH
1063 def _extract_id(self, url):
1064 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1065 if mobj is None:
1066 raise ExtractorError(u'Invalid URL: %s' % url)
1067 video_id = mobj.group(2)
1068 return video_id
1069
1d043b93
JMF
1070 def _get_video_url_list(self, url_map):
1071 """
1072 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1073 with the requested formats.
1074 """
2c62dc26 1075 existing_formats = [x for x in self._formats if x in url_map]
1d043b93
JMF
1076 if len(existing_formats) == 0:
1077 raise ExtractorError(u'no known formats available for video')
4ea3be0a 1078 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1079 video_url_list.reverse() # order worst to best
1d043b93
JMF
1080 return video_url_list
1081
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return an {itag: url} dictionary.

    Every non-empty manifest line that is not a ``#`` directive is taken
    as a format URL; its itag is read from the ``itag/<digits>/`` path
    segment.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')

    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and manifest directives/comments.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
1095
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for *video_id*."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1099
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and format URLs for a single YouTube video.

    Downloads the watch page and the get_video_info data, gathers the
    metadata fields, resolves (and if necessary decrypts) the per-format
    URLs, and returns the info dictionary built at the end of this
    method.

    Returns None when the 'listsubtitles' parameter is set: in that case
    the available subtitles are only listed.

    Raises ExtractorError when the video info carries no token, for
    rental videos, when the uploader is missing, for rtmpe streams, or
    when no usable stream information is found.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes (e.g. "\/") from the embedded URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each "el" variant in turn and stop at the first response
        # that contains a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Replace "/", "," and "-" by spaces and collapse whitespace
        # before handing the text to unified_strdate
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the content of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the <meta name="description"> tag
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Return the comma-grouped number found in the <span> with the
        # given class as an int, or None when it is not on the page.
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        # Listing mode: print the available subtitles and stop here
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # If the player config embedded in the watch page carries encrypted
    # ("s=") stream maps, use those in video_info.  Any ValueError raised
    # in this block (no config, invalid JSON, no stream map) is ignored
    # and video_info is left as it was.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present') # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [('_rtmp', video_info['conn'][0])]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        # Both maps are comma-separated lists of URL-encoded entries
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: append it as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: it has to be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without an age gate the JS player named in the
                        # page's "assets" replaces the SWF player URL
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    formats = []
    for itag, video_real_url in video_url_list:
        dct = {
            'format_id': itag,
            'url': video_real_url,
            'player_url': player_url,
        }
        # Merge in the static per-itag properties
        dct.update(self._formats[itag])
        formats.append(dct)

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count':   view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
c5e8d7af 1365
880e1c52 1366class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1367 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1368 _VALID_URL = r"""(?:
1369 (?:https?://)?
1370 (?:\w+\.)?
1371 youtube\.com/
1372 (?:
1373 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1374 \? (?:.*?&)*? (?:p|a|list)=
1375 | p/
1376 )
715c8e7b 1377 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1378 .*
1379 |
715c8e7b 1380 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1381 )"""
dcbb4580
JMF
1382 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1383 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1384 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1385 IE_NAME = u'youtube:playlist'
1386
1387 @classmethod
1388 def suitable(cls, url):
1389 """Receives a URL and returns True if suitable for this IE."""
1390 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1391
880e1c52
JMF
1392 def _real_initialize(self):
1393 self._login()
1394
652cdaa2
JMF
1395 def _ids_to_results(self, ids):
1396 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1397 for vid_id in ids]
1398
1399 def _extract_mix(self, playlist_id):
1400 # The mixes are generated from a a single video
1401 # the id of the playlist is just 'RD' + video_id
7d4afc55 1402 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1403 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1404 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1405 get_element_by_attribute('class', 'title ', webpage))
1406 title = clean_html(title_span)
652cdaa2
JMF
1407 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1408 ids = orderedSet(re.findall(video_re, webpage))
1409 url_results = self._ids_to_results(ids)
1410
1411 return self.playlist_result(url_results, playlist_id, title)
1412
c5e8d7af
PH
1413 def _real_extract(self, url):
1414 # Extract playlist id
1415 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1416 if mobj is None:
1417 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1418 playlist_id = mobj.group(1) or mobj.group(2)
1419
1420 # Check if it's a video-specific URL
7c61bd36 1421 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1422 if 'v' in query_dict:
1423 video_id = query_dict['v'][0]
1424 if self._downloader.params.get('noplaylist'):
1425 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1426 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1427 else:
1428 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1429
7d4afc55 1430 if playlist_id.startswith('RD'):
652cdaa2
JMF
1431 # Mixes require a custom extraction process
1432 return self._extract_mix(playlist_id)
0a688bc0
JMF
1433 if playlist_id.startswith('TL'):
1434 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1435 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1436
dcbb4580
JMF
1437 # Extract the video ids from the playlist pages
1438 ids = []
c5e8d7af 1439
755eb032 1440 for page_num in itertools.count(1):
dcbb4580 1441 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1442 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1443 matches = re.finditer(self._VIDEO_RE, page)
1444 # We remove the duplicates and the link with index 0
1445 # (it's not the first video of the playlist)
1446 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1447 ids.extend(new_ids)
c5e8d7af 1448
dcbb4580 1449 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1450 break
1451
c91778f8
PH
1452 try:
1453 playlist_title = self._og_search_title(page)
1454 except RegexNotFoundError:
1455 self.report_warning(
1456 u'Playlist page is missing OpenGraph title, falling back ...',
1457 playlist_id)
1458 playlist_title = self._html_search_regex(
1459 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1460
652cdaa2 1461 url_results = self._ids_to_results(ids)
dcbb4580 1462 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1463
1464
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Resolve a "yttoplist:<channel>:<title>" keyword to a playlist."""
        match = re.match(self._VALID_URL, url)
        channel, title = match.group('chann'), match.group('title')

        # Locate, on the channel page, the link whose URL contains the
        # url-encoded list title.
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        attempt = 0
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1495
1496
c5e8d7af 1497class YoutubeChannelIE(InfoExtractor):
0f818663 1498 IE_DESC = u'YouTube.com channels'
c5e8d7af 1499 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1500 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1501 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1502 IE_NAME = u'youtube:channel'
1503
1504 def extract_videos_from_page(self, page):
1505 ids_in_page = []
1506 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1507 if mobj.group(1) not in ids_in_page:
1508 ids_in_page.append(mobj.group(1))
1509 return ids_in_page
1510
1511 def _real_extract(self, url):
1512 # Extract channel id
1513 mobj = re.match(self._VALID_URL, url)
1514 if mobj is None:
1515 raise ExtractorError(u'Invalid URL: %s' % url)
1516
1517 # Download channel page
1518 channel_id = mobj.group(1)
1519 video_ids = []
b9643eed
JMF
1520 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1521 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1522 autogenerated = re.search(r'''(?x)
1523 class="[^"]*?(?:
1524 channel-header-autogenerated-label|
1525 yt-channel-title-autogenerated
1526 )[^"]*"''', channel_page) is not None
c5e8d7af 1527
b9643eed
JMF
1528 if autogenerated:
1529 # The videos are contained in a single page
1530 # the ajax pages can't be used, they are empty
1531 video_ids = self.extract_videos_from_page(channel_page)
1532 else:
1533 # Download all channel pages using the json-based channel_ajax query
1534 for pagenum in itertools.count(1):
1535 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1536 page = self._download_webpage(url, channel_id,
1537 u'Downloading page #%s' % pagenum)
1538
1539 page = json.loads(page)
1540
1541 ids_in_page = self.extract_videos_from_page(page['content_html'])
1542 video_ids.extend(ids_in_page)
1543
1544 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1545 break
c5e8d7af
PH
1546
1547 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1548
7012b23c
PH
1549 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1550 for video_id in video_ids]
1551 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1552
1553
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a YouTube user (URL or "ytuser:" keyword)."""

    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class in this module accepts *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result listing the user's uploaded videos.

        Raises ExtractorError for an invalid URL or when the API answer
        is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        url_results = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The id is the last path component of the entry's id URL.
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # Use the actual id, not the literal string 'video_id'.
                    'id': video_id,
                    'title': title,
                })

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(entries) < self._GDATA_PAGE_SIZE:
                break

        return self.playlist_result(url_results, playlist_title=username)
1623
b05654f0
PH
1624
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Return a playlist with at most *n* results for *query*.

        Results are fetched 50 per request.  Raises ExtractorError when
        an API response contains no items.
        """
        collected_ids = []
        page_index = 0
        wanted = n

        while page_index * 50 < wanted:
            start = (50 * page_index) + 1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), start)
            raw_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend([item['id'] for item in api_response['items']])

            # Never ask for more than the API says exists.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have pushed us past the requested amount.
        collected_ids = collected_ids[:n]
        videos = []
        for video_id in collected_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1662
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE, but the API URL adds
    # orderby=published and the search key is 'ytsearchdate'.
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1668
class YoutubeShowIE(InfoExtractor):
    # Turns a show page into one YoutubePlaylist url result per season.
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of playlist url results, one per season link found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))

        season_results = []
        for path in season_paths:
            season_results.append(self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1682
1683
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the remaining %s takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as a playlist."""
        collected = []
        paging = 0
        page_number = 1
        while True:
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_page)

            # Pull the video ids out of the HTML fragment, dropping repeats
            # while keeping first-seen order.
            id_matches = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            for video_id in orderedSet(match.group(1) for match in id_matches):
                collected.append(self.url_result(video_id, 'Youtube', video_id=video_id))

            # A null paging value marks the last page.
            paging = info['paging']
            if paging is None:
                break
            page_number += 1

        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1726
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'subscriptions' feed.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1732
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'recommended' feed.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1738
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'watch_later' feed, loaded as a
    # personal feed.
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1745
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'history' feed, loaded as a
    # personal feed.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' in a non-raw
    # literal is an invalid escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1752
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    # Resolves the favourites page to its backing playlist and delegates
    # to the YoutubePlaylist extractor.
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is the first list=... value found in the page.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1763
1764
class YoutubeTruncatedURLIE(InfoExtractor):
    # Matches watch URLs whose query string consists only of a feature=...
    # parameter and answers with a hint about shell quoting.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the likely mistake."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)