]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Add __len__ to PagedLists
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c91778f8 32 RegexNotFoundError,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and return True if the download succeeded.

        The download is non-fatal, so a failure yields False instead of
        raising.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Try to log in with the configured credentials.

        Returns True on success and False when no credentials are
        available, when a page cannot be downloaded, or when the response
        still contains the login form.

        Raises ExtractorError if no username is available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False like every other failure path.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # The login form being served again means the credentials were
        # rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data has to be bytes on Python 3; encode it the same way
        # _login encodes its form data.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, in that order.

        Does nothing without a downloader and stops at the first step
        that reports failure.
        """
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 129
8377574c 130
de7f3446 131class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 132 IE_DESC = u'YouTube.com'
cb7dfeea 133 _VALID_URL = r"""(?x)^
c5e8d7af 134 (
83aa5293 135 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 136 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2
PH
137 (?:www\.)?deturl\.com/www\.youtube\.com/|
138 (?:www\.)?pwnyoutube\.com|
e69ae5b9
JMF
139 tube\.majestyc\.net/|
140 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
141 (?:.*?\#/)? # handle anchor (#/) redirect urls
142 (?: # the various things that can precede the ID:
143 (?:(?:v|embed|e)/) # v/ or embed/ or e/
144 |(?: # or the v= param in all its forms
d741e55a 145 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
146 (?:\?|\#!?) # the params delimiter ? or # or #!
147 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
148 v=
149 )
f4b05232
JMF
150 ))
151 |youtu\.be/ # just youtu.be/xxxx
152 )
c5e8d7af 153 )? # all until now is optional -> you can pass the naked ID
8963d9c2 154 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
155 (?(1).+)? # if we found the ID, everything can follow
156 $"""
c5e8d7af 157 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
158 _formats = {
159 '5': {'ext': 'flv', 'width': 400, 'height': 240},
160 '6': {'ext': 'flv', 'width': 450, 'height': 270},
161 '13': {'ext': '3gp'},
162 '17': {'ext': '3gp', 'width': 176, 'height': 144},
163 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
164 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
165 '34': {'ext': 'flv', 'width': 640, 'height': 360},
166 '35': {'ext': 'flv', 'width': 854, 'height': 480},
167 '36': {'ext': '3gp', 'width': 320, 'height': 240},
168 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
169 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
170 '43': {'ext': 'webm', 'width': 640, 'height': 360},
171 '44': {'ext': 'webm', 'width': 854, 'height': 480},
172 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
173 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
174
1d043b93 175
86fe61c8 176 # 3d videos
2c62dc26
PH
177 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
178 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
179 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
180 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
181 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
182 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
183 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 184
96fb5605 185 # Apple HTTP Live Streaming
2c62dc26
PH
186 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
187 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
188 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
189 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
190 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
191 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
192 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
193
194 # DASH mp4 video
195 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
196 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
197 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
198 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
199 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
200 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
201 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
8fa8a629 202 '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
836a086c 203
f6f1fc92 204 # Dash mp4 audio
2c62dc26
PH
205 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
206 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
207 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
208
209 # Dash webm
2c62dc26
PH
210 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
211 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
212 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
213 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
214 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
215 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
216 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
217
218 # Dash webm audio
219 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
220 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
221
222 # RTMP (unnamed)
223 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 224 }
836a086c 225
c5e8d7af 226 IE_NAME = u'youtube'
2eb88d95
PH
227 _TESTS = [
228 {
0e853ca4
PH
229 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
230 u"file": u"BaW_jenozKc.mp4",
231 u"info_dict": {
232 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
233 u"uploader": u"Philipp Hagemeister",
234 u"uploader_id": u"phihag",
235 u"upload_date": u"20121002",
27dcce19 236 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 237 }
0e853ca4 238 },
0e853ca4
PH
239 {
240 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
241 u"file": u"UxxajLWwzqY.mp4",
242 u"note": u"Test generic use_cipher_signature video (#897)",
243 u"info_dict": {
244 u"upload_date": u"20120506",
245 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 246 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 247 u"uploader": u"Icona Pop",
0e853ca4 248 u"uploader_id": u"IconaPop"
2eb88d95 249 }
c108eb73
JMF
250 },
251 {
252 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
253 u"file": u"07FYdnEawAQ.mp4",
254 u"note": u"Test VEVO video with age protection (#956)",
255 u"info_dict": {
256 u"upload_date": u"20130703",
257 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
258 u"description": u"md5:64249768eec3bc4276236606ea996373",
259 u"uploader": u"justintimberlakeVEVO",
260 u"uploader_id": u"justintimberlakeVEVO"
261 }
262 },
fccd3771 263 {
83aa5293 264 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
265 u"file": u"yZIXLfi8CZQ.mp4",
266 u"note": u"Embed-only video (#1746)",
267 u"info_dict": {
268 u"upload_date": u"20120608",
269 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
270 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
271 u"uploader": u"SET India",
272 u"uploader_id": u"setindia"
273 }
274 },
dd27fd17
PH
275 {
276 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
277 u"file": u"a9LDPn-MO4I.m4a",
278 u"note": u"256k DASH audio (format 141) via DASH manifest",
279 u"params": {
280 u"format": "141"
281 },
282 u"info_dict": {
283 u"upload_date": "20121002",
284 u"uploader_id": "8KVIDEO",
285 u"description": "No description available.",
286 u"uploader": "8KVIDEO",
287 u"title": "UHDTV TEST 8K VIDEO.mp4"
288 }
289 },
2eb88d95
PH
290 ]
291
c5e8d7af
PH
292
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs accepted by YoutubePlaylistIE are rejected here; anything else
    is suitable when it matches _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 298
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Starts out empty.
    self._player_cache = dict()
e0df6211 302
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
306
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce on screen that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
310
def report_unavailable_format(self, video_id, format):
    """Announce on screen that *format* is not available for *video_id*."""
    details = (video_id, format)
    self.to_screen(u'%s: Format %s not available' % details)
314
def report_rtmp_download(self):
    """Announce on screen that an RTMP download was detected."""
    message = u'RTMP download detected'
    self.to_screen(message)
318
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a callable that deciphers signatures of length *slen*.

    The player id and type ('js' or 'swf') are taken from the tail of
    *player_url*. When a cache directory is configured, a previously
    stored index list for this player and length is used if it can be
    read; otherwise the player is downloaded and parsed, and the
    resulting permutation is written to the cache (best effort: a
    failure to write only produces a warning).

    Raises ExtractorError if *player_url* does not have the expected
    shape or names an unsupported player type.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_url)
    if id_m is None:
        raise ExtractorError(
            u'Cannot identify player from URL %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes part of a file name, so it must not contain any
    # path component. An explicit check survives "python -O".
    if os.path.basename(func_id) != func_id:
        raise ExtractorError(u'Unsafe cache file name %r' % func_id)
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
        except IOError:
            pass  # No cache available
        except ValueError:
            # A corrupt cache file must not abort the extraction; fall
            # through and rebuild it from the player.
            self._downloader.report_warning(
                u'Ignoring unreadable signature cache %r' % cache_fn)
        else:
            return lambda s: u''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string of distinct characters; the
            # code points of the output are the source indices.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is an optimisation only; report and carry on.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
375
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to *func* for signatures of length *slen*.

    *func* is applied to a string of *slen* distinct characters; the
    resulting permutation is rendered as a sum of index and slice
    expressions on ``s`` and written with to_screen().
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            # A stop below zero cannot be written as a number (it would
            # count from the end), so leave it open instead.
            ends = (u':%d' % (end + step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # The pairwise loop below needs at least two indices; shorter
        # inputs are handled here so the trailing code never reads an
        # unbound loop variable.
        if not idxs:
            return
        if len(idxs) == 1:
            yield u's[%d]' % idxs[0]
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; emit it and start afresh.
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    if not expr_code:
        # Nothing selected from s: the result is the empty string.
        expr_code = u"u''"
    code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 411
e0df6211
PH
412 def _parse_sig_js(self, jscode):
413 funcname = self._search_regex(
414 r'signature=([a-zA-Z]+)', jscode,
415 u'Initial JS player signature function name')
416
417 functions = {}
418
419 def argidx(varname):
420 return string.lowercase.index(varname)
421
422 def interpret_statement(stmt, local_vars, allow_recursion=20):
423 if allow_recursion < 0:
0ca96d48 424 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
425
426 if stmt.startswith(u'var '):
427 stmt = stmt[len(u'var '):]
428 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
429 r'=(?P<expr>.*)$', stmt)
430 if ass_m:
431 if ass_m.groupdict().get('index'):
432 def assign(val):
433 lvar = local_vars[ass_m.group('out')]
434 idx = interpret_expression(ass_m.group('index'),
435 local_vars, allow_recursion)
436 assert isinstance(idx, int)
437 lvar[idx] = val
438 return val
439 expr = ass_m.group('expr')
440 else:
441 def assign(val):
442 local_vars[ass_m.group('out')] = val
443 return val
444 expr = ass_m.group('expr')
445 elif stmt.startswith(u'return '):
446 assign = lambda v: v
447 expr = stmt[len(u'return '):]
448 else:
449 raise ExtractorError(
450 u'Cannot determine left side of statement in %r' % stmt)
451
452 v = interpret_expression(expr, local_vars, allow_recursion)
453 return assign(v)
454
455 def interpret_expression(expr, local_vars, allow_recursion):
456 if expr.isdigit():
457 return int(expr)
458
459 if expr.isalpha():
460 return local_vars[expr]
461
462 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
463 if m:
464 member = m.group('member')
465 val = local_vars[m.group('in')]
466 if member == 'split("")':
467 return list(val)
468 if member == 'join("")':
469 return u''.join(val)
470 if member == 'length':
471 return len(val)
472 if member == 'reverse()':
473 return val[::-1]
474 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
475 if slice_m:
476 idx = interpret_expression(
477 slice_m.group('idx'), local_vars, allow_recursion-1)
478 return val[idx:]
479
480 m = re.match(
481 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
482 if m:
483 val = local_vars[m.group('in')]
484 idx = interpret_expression(m.group('idx'), local_vars,
485 allow_recursion-1)
486 return val[idx]
487
488 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
489 if m:
490 a = interpret_expression(m.group('a'),
491 local_vars, allow_recursion)
492 b = interpret_expression(m.group('b'),
493 local_vars, allow_recursion)
494 return a % b
495
496 m = re.match(
497 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
498 if m:
499 fname = m.group('func')
500 if fname not in functions:
501 functions[fname] = extract_function(fname)
502 argvals = [int(v) if v.isdigit() else local_vars[v]
503 for v in m.group('args').split(',')]
504 return functions[fname](argvals)
505 raise ExtractorError(u'Unsupported JS expression %r' % expr)
506
507 def extract_function(funcname):
508 func_m = re.search(
509 r'function ' + re.escape(funcname) +
510 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
511 jscode)
512 argnames = func_m.group('args').split(',')
513
514 def resf(args):
515 local_vars = dict(zip(argnames, args))
516 for stmt in func_m.group('code').split(';'):
517 res = interpret_statement(stmt, local_vars)
518 return res
519 return resf
520
521 initial_function = extract_function(funcname)
522 return lambda s: initial_function([s])
523
524 def _parse_sig_swf(self, file_contents):
525 if file_contents[1:3] != b'WS':
526 raise ExtractorError(
527 u'Not an SWF file; header is %r' % file_contents[:3])
528 if file_contents[:1] == b'C':
529 content = zlib.decompress(file_contents[8:])
530 else:
531 raise NotImplementedError(u'Unsupported compression format %r' %
532 file_contents[:1])
533
534 def extract_tags(content):
535 pos = 0
536 while pos < len(content):
537 header16 = struct.unpack('<H', content[pos:pos+2])[0]
538 pos += 2
539 tag_code = header16 >> 6
540 tag_len = header16 & 0x3f
541 if tag_len == 0x3f:
542 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
543 pos += 4
544 assert pos+tag_len <= len(content)
545 yield (tag_code, content[pos:pos+tag_len])
546 pos += tag_len
547
548 code_tag = next(tag
549 for tag_code, tag in extract_tags(content)
550 if tag_code == 82)
551 p = code_tag.index(b'\0', 4) + 1
ba552f54 552 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
553
554 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
555 def read_int(reader=None):
556 if reader is None:
557 reader = code_reader
e0df6211
PH
558 res = 0
559 shift = 0
560 for _ in range(5):
ba552f54
PH
561 buf = reader.read(1)
562 assert len(buf) == 1
563 b = struct.unpack('<B', buf)[0]
e0df6211
PH
564 res = res | ((b & 0x7f) << shift)
565 if b & 0x80 == 0:
566 break
567 shift += 7
ba552f54
PH
568 return res
569
570 def u30(reader=None):
571 res = read_int(reader)
572 assert res & 0xf0000000 == 0
e0df6211
PH
573 return res
574 u32 = read_int
575
ba552f54
PH
576 def s32(reader=None):
577 v = read_int(reader)
e0df6211
PH
578 if v & 0x80000000 != 0:
579 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
580 return v
581
0ca96d48 582 def read_string(reader=None):
ba552f54
PH
583 if reader is None:
584 reader = code_reader
585 slen = u30(reader)
586 resb = reader.read(slen)
587 assert len(resb) == slen
588 return resb.decode('utf-8')
589
590 def read_bytes(count, reader=None):
591 if reader is None:
592 reader = code_reader
593 resb = reader.read(count)
594 assert len(resb) == count
595 return resb
596
597 def read_byte(reader=None):
598 resb = read_bytes(1, reader=reader)
599 res = struct.unpack('<B', resb)[0]
600 return res
e0df6211
PH
601
602 # minor_version + major_version
0ca96d48 603 read_bytes(2 + 2)
e0df6211
PH
604
605 # Constant pool
ba552f54 606 int_count = u30()
e0df6211 607 for _c in range(1, int_count):
0ca96d48 608 s32()
ba552f54 609 uint_count = u30()
e0df6211 610 for _c in range(1, uint_count):
0ca96d48 611 u32()
ba552f54 612 double_count = u30()
0ca96d48 613 read_bytes((double_count-1) * 8)
ba552f54 614 string_count = u30()
e0df6211
PH
615 constant_strings = [u'']
616 for _c in range(1, string_count):
0ca96d48 617 s = read_string()
e0df6211 618 constant_strings.append(s)
ba552f54 619 namespace_count = u30()
e0df6211 620 for _c in range(1, namespace_count):
0ca96d48
PH
621 read_bytes(1) # kind
622 u30() # name
ba552f54 623 ns_set_count = u30()
e0df6211 624 for _c in range(1, ns_set_count):
ba552f54 625 count = u30()
e0df6211 626 for _c2 in range(count):
0ca96d48 627 u30()
ba552f54 628 multiname_count = u30()
e0df6211
PH
629 MULTINAME_SIZES = {
630 0x07: 2, # QName
631 0x0d: 2, # QNameA
632 0x0f: 1, # RTQName
633 0x10: 1, # RTQNameA
634 0x11: 0, # RTQNameL
635 0x12: 0, # RTQNameLA
636 0x09: 2, # Multiname
637 0x0e: 2, # MultinameA
638 0x1b: 1, # MultinameL
639 0x1c: 1, # MultinameLA
640 }
641 multinames = [u'']
642 for _c in range(1, multiname_count):
ba552f54 643 kind = u30()
e0df6211
PH
644 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
645 if kind == 0x07:
0ca96d48 646 u30() # namespace_idx
ba552f54 647 name_idx = u30()
e0df6211
PH
648 multinames.append(constant_strings[name_idx])
649 else:
650 multinames.append('[MULTINAME kind: %d]' % kind)
651 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 652 u30()
e0df6211
PH
653
654 # Methods
ba552f54 655 method_count = u30()
e0df6211
PH
656 MethodInfo = collections.namedtuple(
657 'MethodInfo',
658 ['NEED_ARGUMENTS', 'NEED_REST'])
659 method_infos = []
660 for method_id in range(method_count):
ba552f54 661 param_count = u30()
0ca96d48 662 u30() # return type
e0df6211 663 for _ in range(param_count):
0ca96d48
PH
664 u30() # param type
665 u30() # name index (always 0 for youtube)
ba552f54 666 flags = read_byte()
e0df6211
PH
667 if flags & 0x08 != 0:
668 # Options present
ba552f54 669 option_count = u30()
e0df6211 670 for c in range(option_count):
0ca96d48
PH
671 u30() # val
672 read_bytes(1) # kind
e0df6211
PH
673 if flags & 0x80 != 0:
674 # Param names present
675 for _ in range(param_count):
0ca96d48 676 u30() # param name
e0df6211
PH
677 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
678 method_infos.append(mi)
679
680 # Metadata
ba552f54 681 metadata_count = u30()
e0df6211 682 for _c in range(metadata_count):
0ca96d48 683 u30() # name
ba552f54 684 item_count = u30()
e0df6211 685 for _c2 in range(item_count):
0ca96d48
PH
686 u30() # key
687 u30() # value
ba552f54
PH
688
689 def parse_traits_info():
690 trait_name_idx = u30()
691 kind_full = read_byte()
e0df6211
PH
692 kind = kind_full & 0x0f
693 attrs = kind_full >> 4
694 methods = {}
695 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
696 u30() # Slot id
697 u30() # type_name_idx
ba552f54 698 vindex = u30()
e0df6211 699 if vindex != 0:
0ca96d48 700 read_byte() # vkind
e0df6211 701 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 702 u30() # disp_id
ba552f54 703 method_idx = u30()
e0df6211
PH
704 methods[multinames[trait_name_idx]] = method_idx
705 elif kind == 0x04: # Class
0ca96d48
PH
706 u30() # slot_id
707 u30() # classi
e0df6211 708 elif kind == 0x05: # Function
0ca96d48 709 u30() # slot_id
ba552f54 710 function_idx = u30()
e0df6211
PH
711 methods[function_idx] = multinames[trait_name_idx]
712 else:
713 raise ExtractorError(u'Unsupported trait kind %d' % kind)
714
715 if attrs & 0x4 != 0: # Metadata present
ba552f54 716 metadata_count = u30()
e0df6211 717 for _c3 in range(metadata_count):
0ca96d48 718 u30() # metadata index
e0df6211 719
ba552f54 720 return methods
e0df6211
PH
721
722 # Classes
723 TARGET_CLASSNAME = u'SignatureDecipher'
724 searched_idx = multinames.index(TARGET_CLASSNAME)
725 searched_class_id = None
ba552f54 726 class_count = u30()
e0df6211 727 for class_id in range(class_count):
ba552f54 728 name_idx = u30()
e0df6211
PH
729 if name_idx == searched_idx:
730 # We found the class we're looking for!
731 searched_class_id = class_id
0ca96d48 732 u30() # super_name idx
ba552f54 733 flags = read_byte()
e0df6211 734 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 735 u30() # protected_ns_idx
ba552f54 736 intrf_count = u30()
e0df6211 737 for _c2 in range(intrf_count):
0ca96d48
PH
738 u30()
739 u30() # iinit
ba552f54 740 trait_count = u30()
e0df6211 741 for _c2 in range(trait_count):
0ca96d48 742 parse_traits_info()
e0df6211
PH
743
744 if searched_class_id is None:
745 raise ExtractorError(u'Target class %r not found' %
746 TARGET_CLASSNAME)
747
748 method_names = {}
749 method_idxs = {}
750 for class_id in range(class_count):
0ca96d48 751 u30() # cinit
ba552f54 752 trait_count = u30()
e0df6211 753 for _c2 in range(trait_count):
ba552f54 754 trait_methods = parse_traits_info()
e0df6211
PH
755 if class_id == searched_class_id:
756 method_names.update(trait_methods.items())
757 method_idxs.update(dict(
758 (idx, name)
759 for name, idx in trait_methods.items()))
760
761 # Scripts
ba552f54 762 script_count = u30()
e0df6211 763 for _c in range(script_count):
0ca96d48 764 u30() # init
ba552f54 765 trait_count = u30()
e0df6211 766 for _c2 in range(trait_count):
0ca96d48 767 parse_traits_info()
e0df6211
PH
768
769 # Method bodies
ba552f54 770 method_body_count = u30()
e0df6211
PH
771 Method = collections.namedtuple('Method', ['code', 'local_count'])
772 methods = {}
773 for _c in range(method_body_count):
ba552f54 774 method_idx = u30()
0ca96d48 775 u30() # max_stack
ba552f54 776 local_count = u30()
0ca96d48
PH
777 u30() # init_scope_depth
778 u30() # max_scope_depth
ba552f54
PH
779 code_length = u30()
780 code = read_bytes(code_length)
e0df6211 781 if method_idx in method_idxs:
ba552f54 782 m = Method(code, local_count)
e0df6211 783 methods[method_idxs[method_idx]] = m
ba552f54 784 exception_count = u30()
e0df6211 785 for _c2 in range(exception_count):
0ca96d48
PH
786 u30() # from
787 u30() # to
788 u30() # target
789 u30() # exc_type
790 u30() # var_name
ba552f54 791 trait_count = u30()
e0df6211 792 for _c2 in range(trait_count):
0ca96d48 793 parse_traits_info()
e0df6211 794
ba552f54 795 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
796 assert len(methods) == len(method_idxs)
797
798 method_pyfunctions = {}
799
800 def extract_function(func_name):
801 if func_name in method_pyfunctions:
802 return method_pyfunctions[func_name]
803 if func_name not in methods:
804 raise ExtractorError(u'Cannot find function %r' % func_name)
805 m = methods[func_name]
806
807 def resfunc(args):
e0df6211
PH
808 registers = ['(this)'] + list(args) + [None] * m.local_count
809 stack = []
810 coder = io.BytesIO(m.code)
811 while True:
812 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 813 if opcode == 36: # pushbyte
e0df6211
PH
814 v = struct.unpack('!B', coder.read(1))[0]
815 stack.append(v)
816 elif opcode == 44: # pushstring
817 idx = u30(coder)
818 stack.append(constant_strings[idx])
819 elif opcode == 48: # pushscope
820 # We don't implement the scope register, so we'll just
821 # ignore the popped value
822 stack.pop()
823 elif opcode == 70: # callproperty
824 index = u30(coder)
825 mname = multinames[index]
826 arg_count = u30(coder)
827 args = list(reversed(
828 [stack.pop() for _ in range(arg_count)]))
829 obj = stack.pop()
830 if mname == u'split':
831 assert len(args) == 1
832 assert isinstance(args[0], compat_str)
833 assert isinstance(obj, compat_str)
834 if args[0] == u'':
835 res = list(obj)
836 else:
837 res = obj.split(args[0])
838 stack.append(res)
a7177865
PH
839 elif mname == u'slice':
840 assert len(args) == 1
841 assert isinstance(args[0], int)
842 assert isinstance(obj, list)
843 res = obj[args[0]:]
844 stack.append(res)
845 elif mname == u'join':
846 assert len(args) == 1
847 assert isinstance(args[0], compat_str)
848 assert isinstance(obj, list)
849 res = args[0].join(obj)
850 stack.append(res)
e0df6211
PH
851 elif mname in method_pyfunctions:
852 stack.append(method_pyfunctions[mname](args))
853 else:
854 raise NotImplementedError(
855 u'Unsupported property %r on %r'
856 % (mname, obj))
a7177865
PH
857 elif opcode == 72: # returnvalue
858 res = stack.pop()
859 return res
860 elif opcode == 79: # callpropvoid
861 index = u30(coder)
862 mname = multinames[index]
863 arg_count = u30(coder)
864 args = list(reversed(
865 [stack.pop() for _ in range(arg_count)]))
866 obj = stack.pop()
867 if mname == u'reverse':
868 assert isinstance(obj, list)
869 obj.reverse()
870 else:
871 raise NotImplementedError(
872 u'Unsupported (void) property %r on %r'
873 % (mname, obj))
e0df6211
PH
874 elif opcode == 93: # findpropstrict
875 index = u30(coder)
876 mname = multinames[index]
877 res = extract_function(mname)
878 stack.append(res)
879 elif opcode == 97: # setproperty
880 index = u30(coder)
881 value = stack.pop()
882 idx = stack.pop()
883 obj = stack.pop()
884 assert isinstance(obj, list)
885 assert isinstance(idx, int)
886 obj[idx] = value
887 elif opcode == 98: # getlocal
888 index = u30(coder)
889 stack.append(registers[index])
890 elif opcode == 99: # setlocal
891 index = u30(coder)
892 value = stack.pop()
893 registers[index] = value
894 elif opcode == 102: # getproperty
895 index = u30(coder)
896 pname = multinames[index]
897 if pname == u'length':
898 obj = stack.pop()
899 assert isinstance(obj, list)
900 stack.append(len(obj))
901 else: # Assume attribute access
902 idx = stack.pop()
903 assert isinstance(idx, int)
904 obj = stack.pop()
905 assert isinstance(obj, list)
906 stack.append(obj[idx])
907 elif opcode == 128: # coerce
0ca96d48 908 u30(coder)
e0df6211
PH
909 elif opcode == 133: # coerce_s
910 assert isinstance(stack[-1], (type(None), compat_str))
911 elif opcode == 164: # modulo
912 value2 = stack.pop()
913 value1 = stack.pop()
914 res = value1 % value2
915 stack.append(res)
a7177865
PH
916 elif opcode == 208: # getlocal_0
917 stack.append(registers[0])
918 elif opcode == 209: # getlocal_1
919 stack.append(registers[1])
920 elif opcode == 210: # getlocal_2
921 stack.append(registers[2])
922 elif opcode == 211: # getlocal_3
923 stack.append(registers[3])
e0df6211
PH
924 elif opcode == 214: # setlocal_2
925 registers[2] = stack.pop()
926 elif opcode == 215: # setlocal_3
927 registers[3] = stack.pop()
928 else:
929 raise NotImplementedError(
930 u'Unsupported opcode %d' % opcode)
931
932 method_pyfunctions[func_name] = resfunc
933 return resfunc
934
935 initial_function = extract_function(u'decipher')
936 return lambda s: initial_function([s])
937
83799698 938 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 939 """Turn the encrypted s field into a working signature"""
6b37f0be 940
83799698 941 if player_url is not None:
9f9be844
PH
942 if player_url.startswith(u'//'):
943 player_url = u'https:' + player_url
e0df6211 944 try:
7f8ae73a
PH
945 player_id = (player_url, len(s))
946 if player_id not in self._player_cache:
83799698 947 func = self._extract_signature_function(
c4417ddb 948 video_id, player_url, len(s)
e0df6211 949 )
7f8ae73a
PH
950 self._player_cache[player_id] = func
951 func = self._player_cache[player_id]
edf3e38e
PH
952 if self._downloader.params.get('youtube_print_sig_code'):
953 self._print_sig_code(func, len(s))
954 return func(s)
0ca96d48 955 except Exception:
e0df6211 956 tb = traceback.format_exc()
83799698
PH
957 self._downloader.report_warning(
958 u'Automatic signature extraction failed: ' + tb)
e0df6211 959
d2d8f895
PH
960 self._downloader.report_warning(
961 u'Warning: Falling back to static signature algorithm')
920de7a2 962
2f2ffea9
PH
963 return self._static_decrypt_signature(
964 s, video_id, player_url, age_gate)
e0df6211 965
2f2ffea9 966 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
967 if age_gate:
968 # The videos with age protection use another player, so the
969 # algorithms can be different.
970 if len(s) == 86:
971 return s[2:63] + s[82] + s[64:82] + s[63]
972
bc4b9008 973 if len(s) == 93:
974 return s[86:29:-1] + s[88] + s[28:5:-1]
975 elif len(s) == 92:
444b1165 976 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
977 elif len(s) == 91:
978 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
979 elif len(s) == 90:
980 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 981 elif len(s) == 89:
982 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 983 elif len(s) == 88:
3e223834 984 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 985 elif len(s) == 87:
3a725669 986 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 987 elif len(s) == 86:
f2c327fd 988 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 989 elif len(s) == 85:
6ae8ee3f 990 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 991 elif len(s) == 84:
6f56389b 992 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 993 elif len(s) == 83:
920de7a2 994 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 995 elif len(s) == 82:
c21315f2 996 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 997 elif len(s) == 81:
aedd6bb9 998 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
999 elif len(s) == 80:
1000 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1001 elif len(s) == 79:
1002 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1003
1004 else:
1005 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1006
1f343eaa 1007 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1008 try:
7fad1c63
JMF
1009 sub_list = self._download_webpage(
1010 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1011 video_id, note=False)
1012 except ExtractorError as err:
de7f3446
JMF
1013 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1014 return {}
1015 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1016
1017 sub_lang_list = {}
1018 for l in lang_list:
1019 lang = l[1]
1020 params = compat_urllib_parse.urlencode({
1021 'lang': lang,
1022 'v': video_id,
ca715127 1023 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 1024 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446
JMF
1025 })
1026 url = u'http://www.youtube.com/api/timedtext?' + params
1027 sub_lang_list[lang] = url
1028 if not sub_lang_list:
1029 self._downloader.report_warning(u'video doesn\'t have subtitles')
1030 return {}
1031 return sub_lang_list
1032
055e6f36 1033 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1034 """We need the webpage for getting the captions url, pass it as an
1035 argument to speed up the process."""
ca715127 1036 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1037 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1038 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1039 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1040 if mobj is None:
1041 self._downloader.report_warning(err_msg)
1042 return {}
1043 player_config = json.loads(mobj.group(1))
1044 try:
1045 args = player_config[u'args']
1046 caption_url = args[u'ttsurl']
1047 timestamp = args[u'timestamp']
055e6f36
JMF
1048 # We get the available subtitles
1049 list_params = compat_urllib_parse.urlencode({
1050 'type': 'list',
1051 'tlangs': 1,
1052 'asrs': 1,
de7f3446 1053 })
055e6f36 1054 list_url = caption_url + '&' + list_params
e26f8712 1055 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1056 original_lang_node = caption_list.find('track')
f6a54188 1057 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1058 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1059 return {}
1060 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1061
1062 sub_lang_list = {}
1063 for lang_node in caption_list.findall('target'):
1064 sub_lang = lang_node.attrib['lang_code']
1065 params = compat_urllib_parse.urlencode({
1066 'lang': original_lang,
1067 'tlang': sub_lang,
1068 'fmt': sub_format,
1069 'ts': timestamp,
1070 'kind': 'asr',
1071 })
1072 sub_lang_list[sub_lang] = caption_url + '&' + params
1073 return sub_lang_list
de7f3446
JMF
1074 # An extractor error can be raise by the download process if there are
1075 # no automatic captions but there are subtitles
1076 except (KeyError, ExtractorError):
1077 self._downloader.report_warning(err_msg)
1078 return {}
1079
c5e8d7af
PH
1080 def _extract_id(self, url):
1081 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1082 if mobj is None:
1083 raise ExtractorError(u'Invalid URL: %s' % url)
1084 video_id = mobj.group(2)
1085 return video_id
1086
1d043b93
JMF
1087 def _extract_from_m3u8(self, manifest_url, video_id):
1088 url_map = {}
1089 def _get_urls(_manifest):
1090 lines = _manifest.split('\n')
1091 urls = filter(lambda l: l and not l.startswith('#'),
1092 lines)
1093 return urls
1094 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1095 formats_urls = _get_urls(manifest)
1096 for format_url in formats_urls:
890f62e8 1097 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1098 url_map[itag] = format_url
1099 return url_map
1100
1fb07d10
JG
1101 def _extract_annotations(self, video_id):
1102 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1103 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1104
c5e8d7af
PH
def _real_extract(self, url):
    """Extract the info dict (metadata and formats) for a single video

    Downloads the watch page and the get_video_info document, collects
    the metadata, builds the list of formats (rtmp, url-encoded stream
    maps, HLS manifest, DASH manifest) and returns the info dict.
    Returns None when only the list of subtitles was requested.
    Raises ExtractorError when the video information cannot be obtained.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different "el" values until one of them yields a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the content of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Return the integer shown in the <span> with the given class,
        # or None if it is not present in the webpage
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        # The player config is optional, keep what get_video_info gave us
        pass

    def _map_to_format_list(urlmap):
        # Turn an {itag: url} dict into a list of format dicts
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            # An itag missing from the format table must not abort the
            # extraction; keep the bare entry (same lookup as the DASH
            # code below)
            dct.update(self._formats.get(itag, {}))
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Videos without age gate are deciphered with
                        # the JS player referenced by the webpage
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    dash_manifest_url_lst = video_info.get('dashmpd')
    if dash_manifest_url_lst and dash_manifest_url_lst[0]:
        try:
            dash_doc = self._download_xml(
                dash_manifest_url_lst[0], video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Merge into the format with the same id if we already
                # have one, otherwise add a new format
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1412
880e1c52 1413class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1414 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1415 _VALID_URL = r"""(?:
1416 (?:https?://)?
1417 (?:\w+\.)?
1418 youtube\.com/
1419 (?:
1420 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1421 \? (?:.*?&)*? (?:p|a|list)=
1422 | p/
1423 )
715c8e7b 1424 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1425 .*
1426 |
715c8e7b 1427 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1428 )"""
dcbb4580
JMF
1429 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1430 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1431 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1432 IE_NAME = u'youtube:playlist'
1433
1434 @classmethod
1435 def suitable(cls, url):
1436 """Receives a URL and returns True if suitable for this IE."""
1437 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1438
880e1c52
JMF
1439 def _real_initialize(self):
1440 self._login()
1441
652cdaa2
JMF
1442 def _ids_to_results(self, ids):
1443 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1444 for vid_id in ids]
1445
1446 def _extract_mix(self, playlist_id):
1447 # The mixes are generated from a a single video
1448 # the id of the playlist is just 'RD' + video_id
7d4afc55 1449 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1450 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1451 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1452 get_element_by_attribute('class', 'title ', webpage))
1453 title = clean_html(title_span)
652cdaa2
JMF
1454 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1455 ids = orderedSet(re.findall(video_re, webpage))
1456 url_results = self._ids_to_results(ids)
1457
1458 return self.playlist_result(url_results, playlist_id, title)
1459
c5e8d7af
PH
1460 def _real_extract(self, url):
1461 # Extract playlist id
1462 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1463 if mobj is None:
1464 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1465 playlist_id = mobj.group(1) or mobj.group(2)
1466
1467 # Check if it's a video-specific URL
7c61bd36 1468 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1469 if 'v' in query_dict:
1470 video_id = query_dict['v'][0]
1471 if self._downloader.params.get('noplaylist'):
1472 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1473 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1474 else:
1475 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1476
7d4afc55 1477 if playlist_id.startswith('RD'):
652cdaa2
JMF
1478 # Mixes require a custom extraction process
1479 return self._extract_mix(playlist_id)
0a688bc0
JMF
1480 if playlist_id.startswith('TL'):
1481 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1482 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1483
dcbb4580
JMF
1484 # Extract the video ids from the playlist pages
1485 ids = []
c5e8d7af 1486
755eb032 1487 for page_num in itertools.count(1):
dcbb4580 1488 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1489 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1490 matches = re.finditer(self._VIDEO_RE, page)
1491 # We remove the duplicates and the link with index 0
1492 # (it's not the first video of the playlist)
1493 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1494 ids.extend(new_ids)
c5e8d7af 1495
dcbb4580 1496 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1497 break
1498
c91778f8
PH
1499 try:
1500 playlist_title = self._og_search_title(page)
1501 except RegexNotFoundError:
1502 self.report_warning(
1503 u'Playlist page is missing OpenGraph title, falling back ...',
1504 playlist_id)
1505 playlist_title = self._html_search_regex(
1506 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1507
652cdaa2 1508 url_results = self._ids_to_results(ids)
dcbb4580 1509 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1510
1511
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Resolve "yttoplist:<channel>:<title>" into a playlist result"""
        keyword_match = re.match(self._VALID_URL, url)
        channel = keyword_match.group('chann')
        title = keyword_match.group('title')

        # Find the link to the list on the channel page
        title_query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s[^"]+?)"' % re.escape(title_query),
            channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1542
1543
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, without duplicates and
        in order of first appearance"""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with all the videos of the channel"""
        # Extract channel id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = url_match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        video_ids = []
        if autogenerated_match is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            pagenum = 1
            while True:
                page_json = self._download_webpage(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    u'Downloading page #%s' % pagenum)
                page = json.loads(page_json)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1599
1600
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user"""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match it as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result with the user's uploads

        Raises ExtractorError for invalid URLs and (while a page is being
        fetched) when the API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Yield one url result per entry of the given (0-based) page
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No more videos
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The real id of the video, not the literal string
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1661
b05654f0
PH
1662
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData video search API."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist with at most n results for the given query.

        Result pages hold 50 items each; paging stops once either n or the
        total reported by the API has been covered. Raises ExtractorError
        when a response carries no 'items'.
        """
        collected_ids = []
        page_index = 0
        wanted = n

        while page_index * 50 < wanted:
            # start-index is 1-based
            first_item = page_index * 50 + 1
            page_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query), first_item)
            raw_page = self._download_webpage(
                page_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_page)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids += [item['id'] for item in api_response['items']]

            # Never ask for more than the API says exists.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may overshoot the requested count.
        playlist_entries = [
            self.url_result(vid, 'Youtube', video_id=vid)
            for vid in collected_ids[:n]]
        return self.playlist_result(playlist_entries, query)
75dff0ee 1700
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint as the parent class, plus the orderby=published parameter.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1706
class YoutubeShowIE(InfoExtractor):
    """Turn a show page into one playlist URL result per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of YoutubePlaylist url results found on the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1720
1721
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for extractors that read a feed from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the remaining %s takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as a playlist."""
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            raw_info = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_info)
            feed_html = info['feed_html']
            # Collect the ids of all watch links, dropping duplicates.
            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # The response carries the value to request the next page with;
            # None marks the last page.
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1764
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    # Matches the feed URL as well as the ":ytsubs" / ":ytsubscriptions" keywords.
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1770
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    # Matches the feed URL as well as the ":ytrec" / ":ytrecommended" keywords.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1776
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    # Loaded through action_load_personal_feed rather than the system feed action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1783
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling extractors: "\." is not a valid escape in
    # a normal string literal and newer Python versions warn about it.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Loaded through action_load_personal_feed rather than the system feed action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1790
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its underlying playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a YoutubePlaylist url result for the favourites list."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page references the playlist through a "list=<id>" parameter.
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, u'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1801
1802
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that end right after "feature=..." and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)