]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Add build instructions (Fixes #2218)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c91778f8 32 RegexNotFoundError,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Fetch the language-setting URL and return True if a page came back.

        The download is non-fatal, so a failed request yields False instead
        of raising.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True when the login form was accepted and False in every
        other case (no credentials, login page unavailable, request failed,
        or the login form was shown again).

        Raises ExtractorError when no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit bool like every other exit of this method
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form and return True.

        The download is fatal by default here, so a failed request
        propagates as an exception from _download_webpage.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data must be bytes (as in _login); a text string is rejected
        # by Python 3's urllib.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 129
8377574c 130
de7f3446 131class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 132 IE_DESC = u'YouTube.com'
# Verbose regex for every accepted way of naming a single video. Group 1
# is the optional URL prefix, group 2 is the 11-character video ID.
# Every hostname alternative must end with '/', otherwise the path
# separator after the host is left unmatched.
_VALID_URL = r"""(?x)^
                 (
                     (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        (?:www\.)?deturl\.com/www\.youtube\.com/|
                        (?:www\.)?pwnyoutube\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     ))
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )
                 )?                                                       # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                 (?(1).+)?                                                # if we found the ID, everything can follow
                 $"""
c5e8d7af 157 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
158 _formats = {
159 '5': {'ext': 'flv', 'width': 400, 'height': 240},
160 '6': {'ext': 'flv', 'width': 450, 'height': 270},
161 '13': {'ext': '3gp'},
162 '17': {'ext': '3gp', 'width': 176, 'height': 144},
163 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
164 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
165 '34': {'ext': 'flv', 'width': 640, 'height': 360},
166 '35': {'ext': 'flv', 'width': 854, 'height': 480},
167 '36': {'ext': '3gp', 'width': 320, 'height': 240},
168 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
169 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
170 '43': {'ext': 'webm', 'width': 640, 'height': 360},
171 '44': {'ext': 'webm', 'width': 854, 'height': 480},
172 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
173 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
174
1d043b93 175
86fe61c8 176 # 3d videos
2c62dc26
PH
177 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
178 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
179 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
180 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
181 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
182 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
183 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 184
96fb5605 185 # Apple HTTP Live Streaming
2c62dc26
PH
186 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
187 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
188 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
189 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
190 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
191 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
192 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
193
194 # DASH mp4 video
195 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
196 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
197 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
198 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
199 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
200 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
201 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
8fa8a629 202 '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
836a086c 203
f6f1fc92 204 # Dash mp4 audio
2c62dc26
PH
205 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
206 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
207 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
208
209 # Dash webm
2c62dc26
PH
210 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
211 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
212 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
213 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
214 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
215 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
216 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
217
218 # Dash webm audio
219 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
220 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
221
222 # RTMP (unnamed)
223 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 224 }
836a086c 225
c5e8d7af 226 IE_NAME = u'youtube'
2eb88d95
PH
227 _TESTS = [
228 {
0e853ca4
PH
229 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
230 u"file": u"BaW_jenozKc.mp4",
231 u"info_dict": {
232 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
233 u"uploader": u"Philipp Hagemeister",
234 u"uploader_id": u"phihag",
235 u"upload_date": u"20121002",
27dcce19 236 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 237 }
0e853ca4 238 },
0e853ca4
PH
239 {
240 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
241 u"file": u"UxxajLWwzqY.mp4",
242 u"note": u"Test generic use_cipher_signature video (#897)",
243 u"info_dict": {
244 u"upload_date": u"20120506",
245 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 246 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 247 u"uploader": u"Icona Pop",
0e853ca4 248 u"uploader_id": u"IconaPop"
2eb88d95 249 }
c108eb73
JMF
250 },
251 {
252 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
253 u"file": u"07FYdnEawAQ.mp4",
254 u"note": u"Test VEVO video with age protection (#956)",
255 u"info_dict": {
256 u"upload_date": u"20130703",
257 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
258 u"description": u"md5:64249768eec3bc4276236606ea996373",
259 u"uploader": u"justintimberlakeVEVO",
260 u"uploader_id": u"justintimberlakeVEVO"
261 }
262 },
fccd3771 263 {
83aa5293 264 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
265 u"file": u"yZIXLfi8CZQ.mp4",
266 u"note": u"Embed-only video (#1746)",
267 u"info_dict": {
268 u"upload_date": u"20120608",
269 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
270 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
271 u"uploader": u"SET India",
272 u"uploader_id": u"setindia"
273 }
274 },
dd27fd17
PH
275 {
276 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
277 u"file": u"a9LDPn-MO4I.m4a",
278 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
279 u"info_dict": {
280 u"upload_date": "20121002",
281 u"uploader_id": "8KVIDEO",
282 u"description": "No description available.",
283 u"uploader": "8KVIDEO",
284 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
285 },
286 u"params": {
287 u"youtube_include_dash_manifest": True,
288 u"format": "141",
289 },
dd27fd17 290 },
2eb88d95
PH
291 ]
292
c5e8d7af
PH
293
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    A URL accepted by YoutubePlaylistIE is always rejected here, even if
    it would also match _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 299
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Starts out empty; presumably filled per player by the signature
    # code elsewhere in the class - TODO confirm
    self._player_cache = dict()
e0df6211 303
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
307
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that information for *video_id* is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
311
def report_unavailable_format(self, video_id, format):
    """Print a notice that *format* is not available for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
315
def report_rtmp_download(self):
    """Print a notice that an RTMP download was detected."""
    message = u'RTMP download detected'
    self.to_screen(message)
319
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a callable that deciphers signatures of length *slen*.

    The player type ('js' or 'swf') and player id are taken from the tail
    of *player_url*. When a cache directory is configured, a previously
    stored index list for this player and signature length is used
    directly. Otherwise the player is downloaded and parsed, and the
    resulting function is written to the cache as a list of source
    indices.

    Raises ExtractorError when *player_url* does not have the expected
    "...-<id>.<ext>" shape.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if not id_m:
        # Fail with a readable error instead of AttributeError on None
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # The id is used as a file name; make sure it has no path components
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # Cache file is not valid JSON; ignore it and rebuild the
            # function from the player below.
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    if cache_enabled:
        try:
            # Feed a string whose i-th character has code point i, so the
            # output characters reveal which input index lands where.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best effort: warn, but still return the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
376
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python source equivalent to *func* for signatures of length *slen*.

    *func* is probed with a string whose i-th character has code point i.
    The resulting index sequence is rendered as a sum of slices and single
    index expressions on ``s``, then written out with to_screen().
    """
    def render_indices(indices):
        def render_slice(first, last, stride):
            first_txt = u'' if first == 0 else str(first)
            stop = last + stride
            # A negative stop would wrap around, so leave the end open
            stop_txt = (u':%d' % stop) if stop >= 0 else u':'
            stride_txt = u'' if stride == 1 else (u':%d' % stride)
            return u's[%s%s%s]' % (first_txt, stop_txt, stride_txt)

        stride = None
        # Placeholder only; run_start is always assigned together with stride
        run_start = '(Never used)'
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is not None:
                # Inside a run: extend it, or emit it once it breaks
                if delta == stride:
                    continue
                yield render_slice(run_start, before, stride)
                stride = None
                continue
            if delta in [-1, 1]:
                # Two neighbouring indices open a new run
                stride = delta
                run_start = before
                continue
            else:
                yield u's[%d]' % before
        # Flush whatever is pending after the last pair
        if stride is None:
            yield u's[%d]' % cur
        else:
            yield render_slice(run_start, cur, stride)

    probe = u''.join(map(compat_chr, range(slen)))
    scrambled = func(probe)
    index_spec = [ord(ch) for ch in scrambled]
    expr_code = u' + '.join(render_indices(index_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 412
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a Python signature-decipher function from JavaScript player code.

    The name of the entry function is taken from the first
    ``signature=<name>`` occurrence in *jscode*. That function, and any
    function it calls, is run by a small interpreter that supports only
    the statement and expression forms handled below.

    Returns a callable taking the scrambled signature string and
    returning whatever the JS entry function returns for it.

    Raises ExtractorError for unsupported statements or expressions, when
    the recursion limit is exceeded, or when a referenced function
    definition cannot be found in *jscode*.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    # Cache of already extracted functions, keyed by JS function name
    functions = {}

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        # Executes one "x=expr", "x[i]=expr" or "return expr" statement
        # and returns the assigned/returned value.
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')

        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m:
            if ass_m.groupdict().get('index'):
                def assign(val):
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    lvar[idx] = val
                    return val
                expr = ass_m.group('expr')
            else:
                def assign(val):
                    local_vars[ass_m.group('out')] = val
                    return val
                expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            assign = lambda v: v
            expr = stmt[len(u'return '):]
        else:
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)

        v = interpret_expression(expr, local_vars, allow_recursion)
        return assign(v)

    def interpret_expression(expr, local_vars, allow_recursion):
        # Integer literal
        if expr.isdigit():
            return int(expr)

        # Plain variable reference
        if expr.isalpha():
            return local_vars[expr]

        # Member access / method call on a variable
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        if m:
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
                return list(val)
            if member == 'join("")':
                return u''.join(val)
            if member == 'length':
                return len(val)
            if member == 'reverse()':
                return val[::-1]
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
            if slice_m:
                idx = interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion-1)
                return val[idx:]

        # Indexing: a[expr]
        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        if m:
            val = local_vars[m.group('in')]
            idx = interpret_expression(m.group('idx'), local_vars,
                                       allow_recursion-1)
            return val[idx]

        # Modulo: a%b
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        if m:
            a = interpret_expression(m.group('a'),
                                     local_vars, allow_recursion)
            b = interpret_expression(m.group('b'),
                                     local_vars, allow_recursion)
            return a % b

        # Call of another JS function with variable/integer arguments
        m = re.match(
            r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        if m:
            fname = m.group('func')
            if fname not in functions:
                functions[fname] = extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        func_m = re.search(
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            jscode)
        if func_m is None:
            # Fail with a readable error instead of AttributeError on None
            raise ExtractorError(
                u'Could not find JS function %r' % funcname)
        argnames = func_m.group('args').split(',')

        def resf(args):
            local_vars = dict(zip(argnames, args))
            # The value of the last statement is the function's result
            for stmt in func_m.group('code').split(';'):
                res = interpret_statement(stmt, local_vars)
            return res
        return resf

    initial_function = extract_function(funcname)
    return lambda s: initial_function([s])
524
def _parse_sig_swf(self, file_contents):
    """Build a Python signature-decipher function from a compressed SWF player.

    *file_contents* is the raw bytes of a zlib-compressed ('CWS') SWF
    file. The first tag with code 82 is located, its ABC (AVM2 bytecode)
    block is parsed, and the methods of the class named
    'SignatureDecipher' are collected. The returned callable takes the
    scrambled signature string and runs the 'decipher' method through a
    small bytecode interpreter that supports only the opcodes and
    properties handled below.

    Raises:
        ExtractorError: the data has no SWF header, a trait kind is
            unsupported, the target class has no class entry, or a
            referenced function is missing.
        NotImplementedError: the file is not zlib-compressed, or (when
            the returned callable runs) an unsupported opcode/property
            is encountered.
        ValueError: no multiname equals 'SignatureDecipher'.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        # Yields (tag_code, tag_body) pairs. A 6-bit length of 0x3f means
        # the real length follows as a 32-bit little-endian integer.
        # NOTE(review): scanning starts at offset 0 of the decompressed
        # data; confirm whether header fields preceding the first tag
        # need to be skipped.
        pos = 0
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the 4 leading bytes and the NUL-terminated string after them
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, high bit set
        # while more bytes follow, at most 5 bytes.
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        # Reinterpret as a signed 32-bit value
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # Each pool stores count-1 entries (the loops start at 1), and a
    # count of 0 also means "no entries".
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # Clamp at 0: with double_count == 0 a negative size would make
    # read() consume the whole remaining stream.
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields following each multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        # Consumes one trait entry from code_reader and returns the
        # method mappings it declares (possibly empty).
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    # Second pass over the classes: collect the target class's traits
    # as name -> method index and method index -> name.
    method_names = {}
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        # Keep only the bodies belonging to the target class
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole tag must have been consumed, and every method of the
    # target class must have a body.
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    # Cache of already translated methods, keyed by method name
    method_pyfunctions = {}

    def extract_function(func_name):
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 stands for "this", followed by the arguments and
            # the method's local slots.
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    # Arguments are popped last-first, so restore order
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
938
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Decode the scrambled ``s`` value into a usable signature.

    When a player URL is known, the deciphering routine extracted from that
    player is tried first; routines are cached in ``self._player_cache``
    under the key ``(player_url, len(s))``.  If that attempt raises, the
    traceback is reported as a warning and the hard-coded tables of
    ``_static_decrypt_signature`` are used instead.  Without a player URL
    the static tables are used directly.
    """
    if player_url is None:
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

    # Protocol-relative URLs need an explicit scheme
    if player_url.startswith(u'//'):
        player_url = u'https:' + player_url

    try:
        sig_len = len(s)
        cache_key = (player_url, sig_len)
        if cache_key not in self._player_cache:
            extracted = self._extract_signature_function(
                video_id, player_url, sig_len)
            self._player_cache[cache_key] = extracted
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, sig_len)
        return decipher(s)
    except Exception:
        # Best effort: any failure here just triggers the static fallback
        self._downloader.report_warning(
            u'Automatic signature extraction failed: ' +
            traceback.format_exc())

    self._downloader.report_warning(
        u'Warning: Falling back to static signature algorithm')
    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
e0df6211 966
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Unscramble ``s`` with a hard-coded permutation chosen by its length.

    Lengths 79 through 93 are supported.  Age-gated videos have their own
    rule for length 86.  Any other length raises ExtractorError.
    ``video_id`` and ``player_url`` are accepted but not used here.
    """
    sig_len = len(s)

    # The videos with age protection use another player, so the
    # algorithms can be different.
    if age_gate and sig_len == 86:
        return s[2:63] + s[82] + s[64:82] + s[63]

    # One permutation per supported signature length; the lambdas are only
    # evaluated for the matching length, so out-of-range indexes in the
    # other entries are never touched.
    permutations = {
        93: lambda: s[86:29:-1] + s[88] + s[28:5:-1],
        92: lambda: s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83],
        91: lambda: s[84:27:-1] + s[86] + s[26:5:-1],
        90: lambda: s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81],
        89: lambda: s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1],
        88: lambda: s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28],
        87: lambda: s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:],
        86: lambda: s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1],
        85: lambda: s[3:11] + s[0] + s[12:55] + s[84] + s[56:84],
        84: lambda: s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1],
        83: lambda: s[80:63:-1] + s[0] + s[62:0:-1] + s[63],
        82: lambda: s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37],
        81: lambda: s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9],
        80: lambda: s[1:19] + s[0] + s[20:68] + s[19] + s[69:80],
        79: lambda: s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9],
    }

    permute = permutations.get(sig_len)
    if permute is None:
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    return permute()
c5e8d7af 1007
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle download URL.

    The track list is fetched from the timedtext listing endpoint.  A
    failed download or an empty track list is reported as a warning and
    yields an empty dict.  ``webpage`` is accepted but not used here.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', listing)

    urls_by_lang = {}
    for track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': sub_format,
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        # A later track with the same language code replaces an earlier one
        urls_by_lang[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1033
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic-caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The player configuration embedded in ``webpage`` supplies the caption
    base URL and timestamp; the track list is then downloaded to find the
    original (ASR) language and every available translation target.

    Every failure mode is reported as a warning and yields an empty dict:
    a missing or unparsable player configuration, missing keys, a failed
    download, or a first track that is not an ASR track.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The lazy regex can cut the JSON short (or the page can carry
        # something that is not JSON); treat that like "no captions"
        # instead of letting the whole extraction crash.
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1080
c5e8d7af
PH
def _extract_id(self, url):
    """Return the video id, i.e. the second group of ``self._VALID_URL``.

    The pattern is matched in verbose mode.  Raises ExtractorError when
    ``url`` does not match.
    """
    match = re.match(self._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1087
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict of itag -> format URL.

    Every non-empty manifest line that does not start with '#' is taken
    as a format URL; its itag is read from the ``itag/<digits>/`` path
    component.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#'-prefixed lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
1101
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for ``video_id``."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1105
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and the list of formats for a single video.

    Steps, in order: resolve a ``next_url`` redirection, download the watch
    page, download the ``get_video_info`` data (using the embedded-player
    variant when the page shows an age gate), scrape metadata from both,
    build the format list from RTMP / stream-map / HLS data, optionally
    merge in the DASH manifest, then sort the formats.

    Returns the info dict, or None right after listing subtitles when the
    'listsubtitles' parameter is set.

    Raises ExtractorError when the video info has no token, for rental
    videos, when the uploader name is missing, for rtmpe streams, or when
    no format source is found at all.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes (e.g. "\/" becomes "/")
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each "el" variant in turn and keep the first response that
        # carries a token; the last response is kept if none does.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace runs
        # before handing the text to unified_strdate
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the page's <meta name="description"> content
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Return the number inside the <span> with the given class (commas
        # stripped), or None when the page has no such element.
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        # Listing only: stop here and return None
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Best effort: when the page's player config carries encrypted
    # signatures, its stream maps override/extend those from video_info.
    # A missing or unparsable config is silently ignored.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    def _map_to_format_list(urlmap):
        # Build one format dict per itag, enriched with the static data
        # from self._formats when the itag is known.  player_url is read
        # from the enclosing scope at call time.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            # Entries lacking an itag or a url are skipped
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: append as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: must be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without an age gate the JS player URL from the
                        # page replaces the SWF player URL found earlier
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    dash_manifest_url_lst = video_info.get('dashmpd')
    if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
            self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            dash_doc = self._download_xml(
                dash_manifest_url_lst[0], video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already known under this id, or add a
                # new one seeded with the static data from self._formats
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            # The DASH manifest is optional: warn and keep the formats
            # collected so far
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count':   view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
c5e8d7af 1415
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlist URLs and bare prefixed playlist ids.

    Regular playlists are scraped page by page; mixes (ids starting with
    'RD') are read from the watch page of their seed video.
    """
    IE_DESC = u'YouTube.com playlists'
    # Matched in verbose mode.  Group 1 is the id taken from a URL (prefix
    # optional), group 2 a bare id (prefix required).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix, read from its watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for ``url``.

        A URL that also names a video ('v' query parameter) yields just
        that video when the 'noplaylist' parameter is set.  Raises
        ExtractorError for URLs that do not match and for top lists
        ('TL' ids), which are handled by the "yttoplist" keyword.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # playlist_id already carries its own prefix (PL, RD, ...),
                # so it is printed as is
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        # The title is read from the last page that was downloaded
        try:
            playlist_title = self._og_search_title(page)
        except RegexNotFoundError:
            self.report_warning(
                u'Playlist page is missing OpenGraph title, falling back ...',
                playlist_id)
            playlist_title = self._html_search_regex(
                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1513
1514
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the titled list on the channel page and return its videos.

        The list page is downloaded again and again until it contains at
        least one video id.
        """
        parsed = re.match(self._VALID_URL, url)
        channel, title = parsed.group('chann'), parsed.group('title')

        # Locate the link whose href embeds the url-encoded list title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title),
            channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1545
1546
class YoutubeChannelIE(InfoExtractor):
    """Extractor for /channel/<id> URLs; returns the channel's videos."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, first occurrence only."""
        seen = set()
        unique_ids = []
        for found_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if found_id in seen:
                continue
            seen.add(found_id)
            unique_ids.append(found_id)
        return unique_ids

    def _real_extract(self, url):
        """Collect the channel's video ids and return them as a playlist.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_match is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                raw_page = self._download_webpage(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    u'Downloading page #%s' % pagenum)
                payload = json.loads(raw_page)
                video_ids.extend(
                    self.extract_videos_from_page(payload['content_html']))
                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = []
        for video_id in video_ids:
            entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(entries, channel_id)
c5e8d7af
PH
1602
1603
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads (user URL or "ytuser:" keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor in this module claims url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor; this regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of the user's uploads, fetched page by page.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when an API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator over the entries of one API page (pagenum is
            # 0-based, the API's start-index is 1-based)
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: yield nothing
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The actual id, not the literal string 'video_id'
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1664
b05654f0
PH
1665
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (jsonc output)."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist result holding at most n videos matching query.

        Pages of 50 results are requested until either n results or the
        total reported by the API have been covered. Raises ExtractorError
        when a response carries no 'items'.
        """
        found_ids = []
        upper_bound = n
        page_index = 0

        while page_index * 50 < upper_bound:
            first_result = page_index * 50 + 1
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query), first_result)
            raw_json = self._download_webpage(
                api_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            payload = json.loads(raw_json)['data']

            if 'items' not in payload:
                raise ExtractorError(u'[youtube] No video results')

            found_ids.extend([item['id'] for item in payload['items']])

            # Never ask for more than the API says exists.
            upper_bound = min(n, payload['totalItems'])
            page_index += 1

        # The last page may overshoot the requested amount.
        if len(found_ids) > n:
            del found_ids[n:]
        entries = []
        for found_id in found_ids:
            entries.append(self.url_result(found_id, 'Youtube', video_id=found_id))
        return self.playlist_result(entries, query)
75dff0ee 1703
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent class, but the API URL adds
    # orderby=published.
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1709
class YoutubeShowIE(InfoExtractor):
    """Turn a show page into one playlist URL result per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of YoutubePlaylist URL results found on the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        season_results = []
        for season_path in season_paths:
            season_results.append(
                self.url_result('https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1723
1724
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the single %s left over is the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as one playlist."""
        entries = []
        paging = 0
        page_number = 1
        while True:
            raw_info = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_info)
            # Drop repeated ids while keeping the order they appear in.
            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', info['feed_html']))
            for video_id in page_ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # The response hands back the paging value for the next request;
            # None marks the last page.
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1767
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed settings consumed by the YoutubeFeedsInfoExtractor base class.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    # Matches the feed URL or the ":ytsubs" / ":ytsubscriptions" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1773
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed settings consumed by the YoutubeFeedsInfoExtractor base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    # Matches the feed URL or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1779
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed settings consumed by the YoutubeFeedsInfoExtractor base class;
    # this feed is loaded with action_load_personal_feed.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Matches the feed URL or the ":ytwatchlater" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1786
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed settings consumed by the YoutubeFeedsInfoExtractor base class;
    # this feed is loaded with action_load_personal_feed.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string so that "\." reaches the regex engine untouched instead of
    # being an invalid string escape; matches the feed URL or ":ythistory".
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1793
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = u'youtube:favorites'

    def _real_extract(self, url):
        """Return a YoutubePlaylist URL result for the favourites list."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=" parameter on the page is taken as the playlist id.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1804
1805
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch URLs that end right after their first query parameter."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        quoting_hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .'
        )
        raise ExtractorError(quoting_hint, expected=True)