]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[sina] Recognize http://video.sina.com.cn/v/b/{id}-*.html urls (fixes #2212)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c91778f8 32 RegexNotFoundError,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Bundles the three requests issued by _real_initialize() before any
    extraction happens: forcing the interface language, logging in (when
    credentials are available) and confirming the user's age.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL; return True if a (truthy) page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote=u'unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the credentials from _get_login_info().

        Returns True when the login request went through and the response
        does not contain the login form again; returns False in every other
        case (no credentials, download failure, rejected credentials).

        Raises ExtractorError when no username is available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Every other failure path of this method returns False; do the
            # same here instead of falling back to an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were
        # not accepted.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body has to be bytes on Python 3; urlencode() returns
        # text there, so encode it the same way _login() does.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 129
8377574c 130
de7f3446 131class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 132 IE_DESC = u'YouTube.com'
cb7dfeea 133 _VALID_URL = r"""(?x)^
c5e8d7af 134 (
83aa5293 135 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 136 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2
PH
137 (?:www\.)?deturl\.com/www\.youtube\.com/|
138 (?:www\.)?pwnyoutube\.com|
e69ae5b9
JMF
139 tube\.majestyc\.net/|
140 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
141 (?:.*?\#/)? # handle anchor (#/) redirect urls
142 (?: # the various things that can precede the ID:
143 (?:(?:v|embed|e)/) # v/ or embed/ or e/
144 |(?: # or the v= param in all its forms
d741e55a 145 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
146 (?:\?|\#!?) # the params delimiter ? or # or #!
147 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
148 v=
149 )
f4b05232
JMF
150 ))
151 |youtu\.be/ # just youtu.be/xxxx
152 )
c5e8d7af 153 )? # all until now is optional -> you can pass the naked ID
8963d9c2 154 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
155 (?(1).+)? # if we found the ID, everything can follow
156 $"""
c5e8d7af 157 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
158 _formats = {
159 '5': {'ext': 'flv', 'width': 400, 'height': 240},
160 '6': {'ext': 'flv', 'width': 450, 'height': 270},
161 '13': {'ext': '3gp'},
162 '17': {'ext': '3gp', 'width': 176, 'height': 144},
163 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
164 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
165 '34': {'ext': 'flv', 'width': 640, 'height': 360},
166 '35': {'ext': 'flv', 'width': 854, 'height': 480},
167 '36': {'ext': '3gp', 'width': 320, 'height': 240},
168 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
169 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
170 '43': {'ext': 'webm', 'width': 640, 'height': 360},
171 '44': {'ext': 'webm', 'width': 854, 'height': 480},
172 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
173 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
174
1d043b93 175
86fe61c8 176 # 3d videos
2c62dc26
PH
177 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
178 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
179 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
180 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
181 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
182 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
183 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 184
96fb5605 185 # Apple HTTP Live Streaming
2c62dc26
PH
186 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
187 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
188 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
189 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
190 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
191 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
192 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
193
194 # DASH mp4 video
195 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
196 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
197 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
198 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
199 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
200 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
201 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
8fa8a629 202 '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
836a086c 203
f6f1fc92 204 # Dash mp4 audio
2c62dc26
PH
205 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
206 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
207 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
208
209 # Dash webm
2c62dc26
PH
210 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
211 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
212 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
213 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
214 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
215 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
216 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
217
218 # Dash webm audio
219 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
220 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
221
222 # RTMP (unnamed)
223 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 224 }
836a086c 225
c5e8d7af 226 IE_NAME = u'youtube'
2eb88d95
PH
227 _TESTS = [
228 {
0e853ca4
PH
229 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
230 u"file": u"BaW_jenozKc.mp4",
231 u"info_dict": {
232 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
233 u"uploader": u"Philipp Hagemeister",
234 u"uploader_id": u"phihag",
235 u"upload_date": u"20121002",
27dcce19 236 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 237 }
0e853ca4 238 },
0e853ca4
PH
239 {
240 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
241 u"file": u"UxxajLWwzqY.mp4",
242 u"note": u"Test generic use_cipher_signature video (#897)",
243 u"info_dict": {
244 u"upload_date": u"20120506",
245 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 246 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 247 u"uploader": u"Icona Pop",
0e853ca4 248 u"uploader_id": u"IconaPop"
2eb88d95 249 }
c108eb73
JMF
250 },
251 {
252 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
253 u"file": u"07FYdnEawAQ.mp4",
254 u"note": u"Test VEVO video with age protection (#956)",
255 u"info_dict": {
256 u"upload_date": u"20130703",
257 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
258 u"description": u"md5:64249768eec3bc4276236606ea996373",
259 u"uploader": u"justintimberlakeVEVO",
260 u"uploader_id": u"justintimberlakeVEVO"
261 }
262 },
fccd3771 263 {
83aa5293 264 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
265 u"file": u"yZIXLfi8CZQ.mp4",
266 u"note": u"Embed-only video (#1746)",
267 u"info_dict": {
268 u"upload_date": u"20120608",
269 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
270 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
271 u"uploader": u"SET India",
272 u"uploader_id": u"setindia"
273 }
274 },
dd27fd17
PH
275 {
276 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
277 u"file": u"a9LDPn-MO4I.m4a",
278 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
279 u"info_dict": {
280 u"upload_date": "20121002",
281 u"uploader_id": "8KVIDEO",
282 u"description": "No description available.",
283 u"uploader": "8KVIDEO",
284 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
285 },
286 u"params": {
287 u"youtube_include_dash_manifest": True,
288 u"format": "141",
289 },
dd27fd17 290 },
2eb88d95
PH
291 ]
292
c5e8d7af
PH
293
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    Any URL accepted by YoutubePlaylistIE.suitable() is rejected here
    before _VALID_URL is consulted.
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    if not claimed_by_playlist:
        return re.match(cls._VALID_URL, url) is not None
    return False
c5e8d7af 299
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent class and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Starts empty; nothing in this constructor populates it.
    self._player_cache = dict()
e0df6211 303
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce via to_screen() that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
307
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce via to_screen() that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
311
def report_unavailable_format(self, video_id, format):
    """Announce via to_screen() that *format* is not available for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
315
def report_rtmp_download(self):
    """Announce via to_screen() that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
319
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a callable that rearranges a signature of length *slen*.

    The player type ('js' or 'swf') and player id are taken from the tail
    of *player_url*.  When a cache directory is configured, a previously
    stored index list for this player/length is used if present; otherwise
    the player is downloaded, parsed with _parse_sig_js() or
    _parse_sig_swf(), and the resulting permutation is written back to the
    cache on a best-effort basis.

    video_id is only passed on to the download helpers.

    Raises ExtractorError if the player URL cannot be parsed or names an
    unsupported player type.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Sanity check only: the regex above already excludes path separators.
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
        except (IOError, ValueError):
            # No cache available, or the cached file is not valid JSON;
            # either way fall through and rebuild from the player.
            pass
        else:
            return lambda s: u''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An assert would vanish under -O and leave `res` unbound.
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Feed a string whose i-th character has code point i through
            # the function; the code points of the output are then the
            # source indices, i.e. the permutation to store.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best-effort: warn, but still return the function.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
376
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Show Python source equivalent to *func* for signatures of length *slen*.

    *func* is probed with a string whose i-th character has code point i;
    the code points of the result give the source index of every output
    character.  Runs of indices that move by +1 or -1 are printed as
    slices, everything else as single subscripts.  The generated snippet
    is passed to to_screen(); nothing is returned.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to describe for an empty permutation.
            return

        step = None
        start = '(Never used)'  # Squelch pyflakes warnings - start will be
                                # set as soon as step is set
        # Pre-bind i so the code after the loop also works when there is
        # only one index and the loop body never runs.
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev: emit it and start afresh.
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 412
e0df6211
PH
413 def _parse_sig_js(self, jscode):
414 funcname = self._search_regex(
415 r'signature=([a-zA-Z]+)', jscode,
416 u'Initial JS player signature function name')
417
418 functions = {}
419
420 def argidx(varname):
421 return string.lowercase.index(varname)
422
423 def interpret_statement(stmt, local_vars, allow_recursion=20):
424 if allow_recursion < 0:
0ca96d48 425 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
426
427 if stmt.startswith(u'var '):
428 stmt = stmt[len(u'var '):]
429 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
430 r'=(?P<expr>.*)$', stmt)
431 if ass_m:
432 if ass_m.groupdict().get('index'):
433 def assign(val):
434 lvar = local_vars[ass_m.group('out')]
435 idx = interpret_expression(ass_m.group('index'),
436 local_vars, allow_recursion)
437 assert isinstance(idx, int)
438 lvar[idx] = val
439 return val
440 expr = ass_m.group('expr')
441 else:
442 def assign(val):
443 local_vars[ass_m.group('out')] = val
444 return val
445 expr = ass_m.group('expr')
446 elif stmt.startswith(u'return '):
447 assign = lambda v: v
448 expr = stmt[len(u'return '):]
449 else:
450 raise ExtractorError(
451 u'Cannot determine left side of statement in %r' % stmt)
452
453 v = interpret_expression(expr, local_vars, allow_recursion)
454 return assign(v)
455
456 def interpret_expression(expr, local_vars, allow_recursion):
457 if expr.isdigit():
458 return int(expr)
459
460 if expr.isalpha():
461 return local_vars[expr]
462
463 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
464 if m:
465 member = m.group('member')
466 val = local_vars[m.group('in')]
467 if member == 'split("")':
468 return list(val)
469 if member == 'join("")':
470 return u''.join(val)
471 if member == 'length':
472 return len(val)
473 if member == 'reverse()':
474 return val[::-1]
475 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
476 if slice_m:
477 idx = interpret_expression(
478 slice_m.group('idx'), local_vars, allow_recursion-1)
479 return val[idx:]
480
481 m = re.match(
482 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
483 if m:
484 val = local_vars[m.group('in')]
485 idx = interpret_expression(m.group('idx'), local_vars,
486 allow_recursion-1)
487 return val[idx]
488
489 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
490 if m:
491 a = interpret_expression(m.group('a'),
492 local_vars, allow_recursion)
493 b = interpret_expression(m.group('b'),
494 local_vars, allow_recursion)
495 return a % b
496
497 m = re.match(
498 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
499 if m:
500 fname = m.group('func')
501 if fname not in functions:
502 functions[fname] = extract_function(fname)
503 argvals = [int(v) if v.isdigit() else local_vars[v]
504 for v in m.group('args').split(',')]
505 return functions[fname](argvals)
506 raise ExtractorError(u'Unsupported JS expression %r' % expr)
507
508 def extract_function(funcname):
509 func_m = re.search(
510 r'function ' + re.escape(funcname) +
511 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
512 jscode)
513 argnames = func_m.group('args').split(',')
514
515 def resf(args):
516 local_vars = dict(zip(argnames, args))
517 for stmt in func_m.group('code').split(';'):
518 res = interpret_statement(stmt, local_vars)
519 return res
520 return resf
521
522 initial_function = extract_function(funcname)
523 return lambda s: initial_function([s])
524
525 def _parse_sig_swf(self, file_contents):
526 if file_contents[1:3] != b'WS':
527 raise ExtractorError(
528 u'Not an SWF file; header is %r' % file_contents[:3])
529 if file_contents[:1] == b'C':
530 content = zlib.decompress(file_contents[8:])
531 else:
532 raise NotImplementedError(u'Unsupported compression format %r' %
533 file_contents[:1])
534
535 def extract_tags(content):
536 pos = 0
537 while pos < len(content):
538 header16 = struct.unpack('<H', content[pos:pos+2])[0]
539 pos += 2
540 tag_code = header16 >> 6
541 tag_len = header16 & 0x3f
542 if tag_len == 0x3f:
543 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
544 pos += 4
545 assert pos+tag_len <= len(content)
546 yield (tag_code, content[pos:pos+tag_len])
547 pos += tag_len
548
549 code_tag = next(tag
550 for tag_code, tag in extract_tags(content)
551 if tag_code == 82)
552 p = code_tag.index(b'\0', 4) + 1
ba552f54 553 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
554
555 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
556 def read_int(reader=None):
557 if reader is None:
558 reader = code_reader
e0df6211
PH
559 res = 0
560 shift = 0
561 for _ in range(5):
ba552f54
PH
562 buf = reader.read(1)
563 assert len(buf) == 1
564 b = struct.unpack('<B', buf)[0]
e0df6211
PH
565 res = res | ((b & 0x7f) << shift)
566 if b & 0x80 == 0:
567 break
568 shift += 7
ba552f54
PH
569 return res
570
571 def u30(reader=None):
572 res = read_int(reader)
573 assert res & 0xf0000000 == 0
e0df6211
PH
574 return res
575 u32 = read_int
576
ba552f54
PH
577 def s32(reader=None):
578 v = read_int(reader)
e0df6211
PH
579 if v & 0x80000000 != 0:
580 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
581 return v
582
0ca96d48 583 def read_string(reader=None):
ba552f54
PH
584 if reader is None:
585 reader = code_reader
586 slen = u30(reader)
587 resb = reader.read(slen)
588 assert len(resb) == slen
589 return resb.decode('utf-8')
590
591 def read_bytes(count, reader=None):
592 if reader is None:
593 reader = code_reader
594 resb = reader.read(count)
595 assert len(resb) == count
596 return resb
597
598 def read_byte(reader=None):
599 resb = read_bytes(1, reader=reader)
600 res = struct.unpack('<B', resb)[0]
601 return res
e0df6211
PH
602
603 # minor_version + major_version
0ca96d48 604 read_bytes(2 + 2)
e0df6211
PH
605
606 # Constant pool
ba552f54 607 int_count = u30()
e0df6211 608 for _c in range(1, int_count):
0ca96d48 609 s32()
ba552f54 610 uint_count = u30()
e0df6211 611 for _c in range(1, uint_count):
0ca96d48 612 u32()
ba552f54 613 double_count = u30()
0ca96d48 614 read_bytes((double_count-1) * 8)
ba552f54 615 string_count = u30()
e0df6211
PH
616 constant_strings = [u'']
617 for _c in range(1, string_count):
0ca96d48 618 s = read_string()
e0df6211 619 constant_strings.append(s)
ba552f54 620 namespace_count = u30()
e0df6211 621 for _c in range(1, namespace_count):
0ca96d48
PH
622 read_bytes(1) # kind
623 u30() # name
ba552f54 624 ns_set_count = u30()
e0df6211 625 for _c in range(1, ns_set_count):
ba552f54 626 count = u30()
e0df6211 627 for _c2 in range(count):
0ca96d48 628 u30()
ba552f54 629 multiname_count = u30()
e0df6211
PH
630 MULTINAME_SIZES = {
631 0x07: 2, # QName
632 0x0d: 2, # QNameA
633 0x0f: 1, # RTQName
634 0x10: 1, # RTQNameA
635 0x11: 0, # RTQNameL
636 0x12: 0, # RTQNameLA
637 0x09: 2, # Multiname
638 0x0e: 2, # MultinameA
639 0x1b: 1, # MultinameL
640 0x1c: 1, # MultinameLA
641 }
642 multinames = [u'']
643 for _c in range(1, multiname_count):
ba552f54 644 kind = u30()
e0df6211
PH
645 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
646 if kind == 0x07:
0ca96d48 647 u30() # namespace_idx
ba552f54 648 name_idx = u30()
e0df6211
PH
649 multinames.append(constant_strings[name_idx])
650 else:
651 multinames.append('[MULTINAME kind: %d]' % kind)
652 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 653 u30()
e0df6211
PH
654
655 # Methods
ba552f54 656 method_count = u30()
e0df6211
PH
657 MethodInfo = collections.namedtuple(
658 'MethodInfo',
659 ['NEED_ARGUMENTS', 'NEED_REST'])
660 method_infos = []
661 for method_id in range(method_count):
ba552f54 662 param_count = u30()
0ca96d48 663 u30() # return type
e0df6211 664 for _ in range(param_count):
0ca96d48
PH
665 u30() # param type
666 u30() # name index (always 0 for youtube)
ba552f54 667 flags = read_byte()
e0df6211
PH
668 if flags & 0x08 != 0:
669 # Options present
ba552f54 670 option_count = u30()
e0df6211 671 for c in range(option_count):
0ca96d48
PH
672 u30() # val
673 read_bytes(1) # kind
e0df6211
PH
674 if flags & 0x80 != 0:
675 # Param names present
676 for _ in range(param_count):
0ca96d48 677 u30() # param name
e0df6211
PH
678 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
679 method_infos.append(mi)
680
681 # Metadata
ba552f54 682 metadata_count = u30()
e0df6211 683 for _c in range(metadata_count):
0ca96d48 684 u30() # name
ba552f54 685 item_count = u30()
e0df6211 686 for _c2 in range(item_count):
0ca96d48
PH
687 u30() # key
688 u30() # value
ba552f54
PH
689
690 def parse_traits_info():
691 trait_name_idx = u30()
692 kind_full = read_byte()
e0df6211
PH
693 kind = kind_full & 0x0f
694 attrs = kind_full >> 4
695 methods = {}
696 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
697 u30() # Slot id
698 u30() # type_name_idx
ba552f54 699 vindex = u30()
e0df6211 700 if vindex != 0:
0ca96d48 701 read_byte() # vkind
e0df6211 702 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 703 u30() # disp_id
ba552f54 704 method_idx = u30()
e0df6211
PH
705 methods[multinames[trait_name_idx]] = method_idx
706 elif kind == 0x04: # Class
0ca96d48
PH
707 u30() # slot_id
708 u30() # classi
e0df6211 709 elif kind == 0x05: # Function
0ca96d48 710 u30() # slot_id
ba552f54 711 function_idx = u30()
e0df6211
PH
712 methods[function_idx] = multinames[trait_name_idx]
713 else:
714 raise ExtractorError(u'Unsupported trait kind %d' % kind)
715
716 if attrs & 0x4 != 0: # Metadata present
ba552f54 717 metadata_count = u30()
e0df6211 718 for _c3 in range(metadata_count):
0ca96d48 719 u30() # metadata index
e0df6211 720
ba552f54 721 return methods
e0df6211
PH
722
723 # Classes
724 TARGET_CLASSNAME = u'SignatureDecipher'
725 searched_idx = multinames.index(TARGET_CLASSNAME)
726 searched_class_id = None
ba552f54 727 class_count = u30()
e0df6211 728 for class_id in range(class_count):
ba552f54 729 name_idx = u30()
e0df6211
PH
730 if name_idx == searched_idx:
731 # We found the class we're looking for!
732 searched_class_id = class_id
0ca96d48 733 u30() # super_name idx
ba552f54 734 flags = read_byte()
e0df6211 735 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 736 u30() # protected_ns_idx
ba552f54 737 intrf_count = u30()
e0df6211 738 for _c2 in range(intrf_count):
0ca96d48
PH
739 u30()
740 u30() # iinit
ba552f54 741 trait_count = u30()
e0df6211 742 for _c2 in range(trait_count):
0ca96d48 743 parse_traits_info()
e0df6211
PH
744
745 if searched_class_id is None:
746 raise ExtractorError(u'Target class %r not found' %
747 TARGET_CLASSNAME)
748
749 method_names = {}
750 method_idxs = {}
751 for class_id in range(class_count):
0ca96d48 752 u30() # cinit
ba552f54 753 trait_count = u30()
e0df6211 754 for _c2 in range(trait_count):
ba552f54 755 trait_methods = parse_traits_info()
e0df6211
PH
756 if class_id == searched_class_id:
757 method_names.update(trait_methods.items())
758 method_idxs.update(dict(
759 (idx, name)
760 for name, idx in trait_methods.items()))
761
762 # Scripts
ba552f54 763 script_count = u30()
e0df6211 764 for _c in range(script_count):
0ca96d48 765 u30() # init
ba552f54 766 trait_count = u30()
e0df6211 767 for _c2 in range(trait_count):
0ca96d48 768 parse_traits_info()
e0df6211
PH
769
770 # Method bodies
ba552f54 771 method_body_count = u30()
e0df6211
PH
772 Method = collections.namedtuple('Method', ['code', 'local_count'])
773 methods = {}
774 for _c in range(method_body_count):
ba552f54 775 method_idx = u30()
0ca96d48 776 u30() # max_stack
ba552f54 777 local_count = u30()
0ca96d48
PH
778 u30() # init_scope_depth
779 u30() # max_scope_depth
ba552f54
PH
780 code_length = u30()
781 code = read_bytes(code_length)
e0df6211 782 if method_idx in method_idxs:
ba552f54 783 m = Method(code, local_count)
e0df6211 784 methods[method_idxs[method_idx]] = m
ba552f54 785 exception_count = u30()
e0df6211 786 for _c2 in range(exception_count):
0ca96d48
PH
787 u30() # from
788 u30() # to
789 u30() # target
790 u30() # exc_type
791 u30() # var_name
ba552f54 792 trait_count = u30()
e0df6211 793 for _c2 in range(trait_count):
0ca96d48 794 parse_traits_info()
e0df6211 795
ba552f54 796 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
797 assert len(methods) == len(method_idxs)
798
799 method_pyfunctions = {}
800
801 def extract_function(func_name):
802 if func_name in method_pyfunctions:
803 return method_pyfunctions[func_name]
804 if func_name not in methods:
805 raise ExtractorError(u'Cannot find function %r' % func_name)
806 m = methods[func_name]
807
808 def resfunc(args):
e0df6211
PH
809 registers = ['(this)'] + list(args) + [None] * m.local_count
810 stack = []
811 coder = io.BytesIO(m.code)
812 while True:
813 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 814 if opcode == 36: # pushbyte
e0df6211
PH
815 v = struct.unpack('!B', coder.read(1))[0]
816 stack.append(v)
817 elif opcode == 44: # pushstring
818 idx = u30(coder)
819 stack.append(constant_strings[idx])
820 elif opcode == 48: # pushscope
821 # We don't implement the scope register, so we'll just
822 # ignore the popped value
823 stack.pop()
824 elif opcode == 70: # callproperty
825 index = u30(coder)
826 mname = multinames[index]
827 arg_count = u30(coder)
828 args = list(reversed(
829 [stack.pop() for _ in range(arg_count)]))
830 obj = stack.pop()
831 if mname == u'split':
832 assert len(args) == 1
833 assert isinstance(args[0], compat_str)
834 assert isinstance(obj, compat_str)
835 if args[0] == u'':
836 res = list(obj)
837 else:
838 res = obj.split(args[0])
839 stack.append(res)
a7177865
PH
840 elif mname == u'slice':
841 assert len(args) == 1
842 assert isinstance(args[0], int)
843 assert isinstance(obj, list)
844 res = obj[args[0]:]
845 stack.append(res)
846 elif mname == u'join':
847 assert len(args) == 1
848 assert isinstance(args[0], compat_str)
849 assert isinstance(obj, list)
850 res = args[0].join(obj)
851 stack.append(res)
e0df6211
PH
852 elif mname in method_pyfunctions:
853 stack.append(method_pyfunctions[mname](args))
854 else:
855 raise NotImplementedError(
856 u'Unsupported property %r on %r'
857 % (mname, obj))
a7177865
PH
858 elif opcode == 72: # returnvalue
859 res = stack.pop()
860 return res
861 elif opcode == 79: # callpropvoid
862 index = u30(coder)
863 mname = multinames[index]
864 arg_count = u30(coder)
865 args = list(reversed(
866 [stack.pop() for _ in range(arg_count)]))
867 obj = stack.pop()
868 if mname == u'reverse':
869 assert isinstance(obj, list)
870 obj.reverse()
871 else:
872 raise NotImplementedError(
873 u'Unsupported (void) property %r on %r'
874 % (mname, obj))
e0df6211
PH
875 elif opcode == 93: # findpropstrict
876 index = u30(coder)
877 mname = multinames[index]
878 res = extract_function(mname)
879 stack.append(res)
880 elif opcode == 97: # setproperty
881 index = u30(coder)
882 value = stack.pop()
883 idx = stack.pop()
884 obj = stack.pop()
885 assert isinstance(obj, list)
886 assert isinstance(idx, int)
887 obj[idx] = value
888 elif opcode == 98: # getlocal
889 index = u30(coder)
890 stack.append(registers[index])
891 elif opcode == 99: # setlocal
892 index = u30(coder)
893 value = stack.pop()
894 registers[index] = value
895 elif opcode == 102: # getproperty
896 index = u30(coder)
897 pname = multinames[index]
898 if pname == u'length':
899 obj = stack.pop()
900 assert isinstance(obj, list)
901 stack.append(len(obj))
902 else: # Assume attribute access
903 idx = stack.pop()
904 assert isinstance(idx, int)
905 obj = stack.pop()
906 assert isinstance(obj, list)
907 stack.append(obj[idx])
908 elif opcode == 128: # coerce
0ca96d48 909 u30(coder)
e0df6211
PH
910 elif opcode == 133: # coerce_s
911 assert isinstance(stack[-1], (type(None), compat_str))
912 elif opcode == 164: # modulo
913 value2 = stack.pop()
914 value1 = stack.pop()
915 res = value1 % value2
916 stack.append(res)
a7177865
PH
917 elif opcode == 208: # getlocal_0
918 stack.append(registers[0])
919 elif opcode == 209: # getlocal_1
920 stack.append(registers[1])
921 elif opcode == 210: # getlocal_2
922 stack.append(registers[2])
923 elif opcode == 211: # getlocal_3
924 stack.append(registers[3])
e0df6211
PH
925 elif opcode == 214: # setlocal_2
926 registers[2] = stack.pop()
927 elif opcode == 215: # setlocal_3
928 registers[3] = stack.pop()
929 else:
930 raise NotImplementedError(
931 u'Unsupported opcode %d' % opcode)
932
933 method_pyfunctions[func_name] = resfunc
934 return resfunc
935
936 initial_function = extract_function(u'decipher')
937 return lambda s: initial_function([s])
938
83799698 939 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 940 """Turn the encrypted s field into a working signature"""
6b37f0be 941
83799698 942 if player_url is not None:
9f9be844
PH
943 if player_url.startswith(u'//'):
944 player_url = u'https:' + player_url
e0df6211 945 try:
7f8ae73a
PH
946 player_id = (player_url, len(s))
947 if player_id not in self._player_cache:
83799698 948 func = self._extract_signature_function(
c4417ddb 949 video_id, player_url, len(s)
e0df6211 950 )
7f8ae73a
PH
951 self._player_cache[player_id] = func
952 func = self._player_cache[player_id]
edf3e38e
PH
953 if self._downloader.params.get('youtube_print_sig_code'):
954 self._print_sig_code(func, len(s))
955 return func(s)
0ca96d48 956 except Exception:
e0df6211 957 tb = traceback.format_exc()
83799698
PH
958 self._downloader.report_warning(
959 u'Automatic signature extraction failed: ' + tb)
e0df6211 960
d2d8f895
PH
961 self._downloader.report_warning(
962 u'Warning: Falling back to static signature algorithm')
920de7a2 963
2f2ffea9
PH
964 return self._static_decrypt_signature(
965 s, video_id, player_url, age_gate)
e0df6211 966
2f2ffea9 967 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
968 if age_gate:
969 # The videos with age protection use another player, so the
970 # algorithms can be different.
971 if len(s) == 86:
972 return s[2:63] + s[82] + s[64:82] + s[63]
973
bc4b9008 974 if len(s) == 93:
975 return s[86:29:-1] + s[88] + s[28:5:-1]
976 elif len(s) == 92:
444b1165 977 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
978 elif len(s) == 91:
979 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
980 elif len(s) == 90:
981 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 982 elif len(s) == 89:
983 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 984 elif len(s) == 88:
3e223834 985 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 986 elif len(s) == 87:
3a725669 987 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 988 elif len(s) == 86:
f2c327fd 989 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 990 elif len(s) == 85:
6ae8ee3f 991 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 992 elif len(s) == 84:
6f56389b 993 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 994 elif len(s) == 83:
920de7a2 995 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 996 elif len(s) == 82:
c21315f2 997 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 998 elif len(s) == 81:
aedd6bb9 999 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1000 elif len(s) == 80:
1001 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1002 elif len(s) == 79:
1003 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1004
1005 else:
1006 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1007
1f343eaa 1008 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1009 try:
7fad1c63
JMF
1010 sub_list = self._download_webpage(
1011 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1012 video_id, note=False)
1013 except ExtractorError as err:
de7f3446
JMF
1014 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1015 return {}
1016 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1017
1018 sub_lang_list = {}
1019 for l in lang_list:
1020 lang = l[1]
1021 params = compat_urllib_parse.urlencode({
1022 'lang': lang,
1023 'v': video_id,
ca715127 1024 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 1025 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446
JMF
1026 })
1027 url = u'http://www.youtube.com/api/timedtext?' + params
1028 sub_lang_list[lang] = url
1029 if not sub_lang_list:
1030 self._downloader.report_warning(u'video doesn\'t have subtitles')
1031 return {}
1032 return sub_lang_list
1033
055e6f36 1034 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1035 """We need the webpage for getting the captions url, pass it as an
1036 argument to speed up the process."""
ca715127 1037 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1038 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1039 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1040 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1041 if mobj is None:
1042 self._downloader.report_warning(err_msg)
1043 return {}
1044 player_config = json.loads(mobj.group(1))
1045 try:
1046 args = player_config[u'args']
1047 caption_url = args[u'ttsurl']
1048 timestamp = args[u'timestamp']
055e6f36
JMF
1049 # We get the available subtitles
1050 list_params = compat_urllib_parse.urlencode({
1051 'type': 'list',
1052 'tlangs': 1,
1053 'asrs': 1,
de7f3446 1054 })
055e6f36 1055 list_url = caption_url + '&' + list_params
e26f8712 1056 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1057 original_lang_node = caption_list.find('track')
f6a54188 1058 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1059 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1060 return {}
1061 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1062
1063 sub_lang_list = {}
1064 for lang_node in caption_list.findall('target'):
1065 sub_lang = lang_node.attrib['lang_code']
1066 params = compat_urllib_parse.urlencode({
1067 'lang': original_lang,
1068 'tlang': sub_lang,
1069 'fmt': sub_format,
1070 'ts': timestamp,
1071 'kind': 'asr',
1072 })
1073 sub_lang_list[sub_lang] = caption_url + '&' + params
1074 return sub_lang_list
de7f3446
JMF
1075 # An extractor error can be raise by the download process if there are
1076 # no automatic captions but there are subtitles
1077 except (KeyError, ExtractorError):
1078 self._downloader.report_warning(err_msg)
1079 return {}
1080
c5e8d7af
PH
1081 def _extract_id(self, url):
1082 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1083 if mobj is None:
1084 raise ExtractorError(u'Invalid URL: %s' % url)
1085 video_id = mobj.group(2)
1086 return video_id
1087
1d043b93
JMF
1088 def _extract_from_m3u8(self, manifest_url, video_id):
1089 url_map = {}
1090 def _get_urls(_manifest):
1091 lines = _manifest.split('\n')
1092 urls = filter(lambda l: l and not l.startswith('#'),
1093 lines)
1094 return urls
1095 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1096 formats_urls = _get_urls(manifest)
1097 for format_url in formats_urls:
890f62e8 1098 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1099 url_map[itag] = format_url
1100 return url_map
1101
1fb07d10
JG
1102 def _extract_annotations(self, video_id):
1103 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1104 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1105
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and the list of formats for a single video.

    Steps: resolve a possible next_url redirection, download the watch
    page, fetch get_video_info (using the embedded-player variant for
    age-gated videos), collect the metadata fields, build the format list
    from the rtmp connection, the URL-encoded stream maps or the HLS
    manifest, and optionally merge in the DASH manifest.

    Returns the info dict, or None when only the subtitle listing was
    requested. Raises ExtractorError when the video info lacks a token,
    the uploader is missing, the video is a rental, rtmpe is required, or
    no format information is present.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Strip the backslash escaping used inside the embedded JSON
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different 'el' variants until one response carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Parse a comma-grouped integer out of a <span> with the given class
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Best effort: when the watch page embeds stream maps with encrypted
    # signatures, prefer/merge them into video_info. Any ValueError raised
    # here only aborts this block; its message is never shown.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            # An itag missing from the static table still yields a usable
            # format (without extra metadata) instead of a KeyError; this
            # mirrors the lookup used for DASH representations below.
            dct.update(self._formats.get(itag, {}))
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: append as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: needs deciphering first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Non age-gated videos use the JS player named in
                        # the page's "assets" entry instead of the SWF one
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    dash_manifest_url_lst = video_info.get('dashmpd')
    if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
            self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            dash_doc = self._download_xml(
                dash_manifest_url_lst[0], video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Merge into an already known format with the same id,
                # otherwise add it as a new format
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1414
880e1c52 1415class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1416 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1417 _VALID_URL = r"""(?:
1418 (?:https?://)?
1419 (?:\w+\.)?
1420 youtube\.com/
1421 (?:
1422 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1423 \? (?:.*?&)*? (?:p|a|list)=
1424 | p/
1425 )
715c8e7b 1426 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1427 .*
1428 |
715c8e7b 1429 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1430 )"""
dcbb4580
JMF
1431 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1432 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1433 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1434 IE_NAME = u'youtube:playlist'
1435
1436 @classmethod
1437 def suitable(cls, url):
1438 """Receives a URL and returns True if suitable for this IE."""
1439 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1440
880e1c52
JMF
1441 def _real_initialize(self):
1442 self._login()
1443
652cdaa2
JMF
1444 def _ids_to_results(self, ids):
1445 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1446 for vid_id in ids]
1447
1448 def _extract_mix(self, playlist_id):
1449 # The mixes are generated from a a single video
1450 # the id of the playlist is just 'RD' + video_id
7d4afc55 1451 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1452 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1453 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1454 get_element_by_attribute('class', 'title ', webpage))
1455 title = clean_html(title_span)
652cdaa2
JMF
1456 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1457 ids = orderedSet(re.findall(video_re, webpage))
1458 url_results = self._ids_to_results(ids)
1459
1460 return self.playlist_result(url_results, playlist_id, title)
1461
c5e8d7af
PH
1462 def _real_extract(self, url):
1463 # Extract playlist id
1464 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1465 if mobj is None:
1466 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1467 playlist_id = mobj.group(1) or mobj.group(2)
1468
1469 # Check if it's a video-specific URL
7c61bd36 1470 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1471 if 'v' in query_dict:
1472 video_id = query_dict['v'][0]
1473 if self._downloader.params.get('noplaylist'):
1474 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1475 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1476 else:
1477 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1478
7d4afc55 1479 if playlist_id.startswith('RD'):
652cdaa2
JMF
1480 # Mixes require a custom extraction process
1481 return self._extract_mix(playlist_id)
0a688bc0
JMF
1482 if playlist_id.startswith('TL'):
1483 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1484 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1485
dcbb4580
JMF
1486 # Extract the video ids from the playlist pages
1487 ids = []
c5e8d7af 1488
755eb032 1489 for page_num in itertools.count(1):
dcbb4580 1490 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1491 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1492 matches = re.finditer(self._VIDEO_RE, page)
1493 # We remove the duplicates and the link with index 0
1494 # (it's not the first video of the playlist)
1495 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1496 ids.extend(new_ids)
c5e8d7af 1497
dcbb4580 1498 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1499 break
1500
c91778f8
PH
1501 try:
1502 playlist_title = self._og_search_title(page)
1503 except RegexNotFoundError:
1504 self.report_warning(
1505 u'Playlist page is missing OpenGraph title, falling back ...',
1506 playlist_id)
1507 playlist_title = self._html_search_regex(
1508 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1509
652cdaa2 1510 url_results = self._ids_to_results(ids)
dcbb4580 1511 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1512
1513
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' pseudo URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list link on the channel page and return its videos."""
        parsed = re.match(self._VALID_URL, url)
        channel = parsed.group('chann')
        title = parsed.group('title')

        # The list link on the channel page contains the url-encoded title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        link_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        list_link = self._html_search_regex(link_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', list_link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1544
1545
c5e8d7af 1546class YoutubeChannelIE(InfoExtractor):
0f818663 1547 IE_DESC = u'YouTube.com channels'
c5e8d7af 1548 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1549 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1550 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1551 IE_NAME = u'youtube:channel'
1552
1553 def extract_videos_from_page(self, page):
1554 ids_in_page = []
1555 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1556 if mobj.group(1) not in ids_in_page:
1557 ids_in_page.append(mobj.group(1))
1558 return ids_in_page
1559
1560 def _real_extract(self, url):
1561 # Extract channel id
1562 mobj = re.match(self._VALID_URL, url)
1563 if mobj is None:
1564 raise ExtractorError(u'Invalid URL: %s' % url)
1565
1566 # Download channel page
1567 channel_id = mobj.group(1)
1568 video_ids = []
b9643eed
JMF
1569 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1570 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1571 autogenerated = re.search(r'''(?x)
1572 class="[^"]*?(?:
1573 channel-header-autogenerated-label|
1574 yt-channel-title-autogenerated
1575 )[^"]*"''', channel_page) is not None
c5e8d7af 1576
b9643eed
JMF
1577 if autogenerated:
1578 # The videos are contained in a single page
1579 # the ajax pages can't be used, they are empty
1580 video_ids = self.extract_videos_from_page(channel_page)
1581 else:
1582 # Download all channel pages using the json-based channel_ajax query
1583 for pagenum in itertools.count(1):
1584 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1585 page = self._download_webpage(url, channel_id,
1586 u'Downloading page #%s' % pagenum)
1587
1588 page = json.loads(page)
1589
1590 ids_in_page = self.extract_videos_from_page(page['content_html'])
1591 video_ids.extend(ids_in_page)
1592
1593 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1594 break
c5e8d7af
PH
1595
1596 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1597
7012b23c
PH
1598 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1599 for video_id in video_ids]
1600 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1601
1602
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a YouTube user (URL or "ytuser" keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class in this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor; this regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploaded videos."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # GData start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without entries marks the end of the uploads
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The actual id, not the literal string 'video_id'
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1663
b05654f0
PH
1664
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData video search feed (_API_URL)."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist result with at most n videos matching query.

        Raises ExtractorError when an API page contains no 'items'.
        """
        found_ids = []
        wanted = n
        page_index = 0

        # Each API page holds up to 50 results
        while 50 * page_index < wanted:
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query), 50 * page_index + 1)
            raw_json = self._download_webpage(
                api_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            found_ids.extend([item['id'] for item in api_response['items']])

            # Never ask for more than the API says is available
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have overshot the requested amount
        results = []
        for found_id in found_ids[:n]:
            results.append(self.url_result(found_id, 'Youtube', video_id=found_id))
        return self.playlist_result(results, query)
75dff0ee 1702
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1708
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; yields one playlist URL per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one for each season playlist link."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')

        # Every season of the show is linked as its own playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))

        season_results = []
        for season_path in season_paths:
            season_results.append(
                self.url_result('https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1722
1723
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for extractors reading from
    http://www.youtube.com/feed_ajax
    Subclasses have to provide the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is used in place of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the remaining %s is the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk all pages of the feed and return them as one playlist."""
        collected = []
        paging = 0
        page_number = 1
        while True:
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_page)
            watch_links = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            # Drop repeated ids while keeping their first-seen order
            for video_id in orderedSet(link.group(1) for link in watch_links):
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # A paging value of None marks the last page
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1766
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1772
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1778
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1785
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains "\." which is not a valid escape
    # sequence in a regular string literal. The resulting pattern text is
    # unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1792
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""
    _LOGIN_REQUIRED = True
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from the first "list=" parameter on the page
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1803
1804
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that end right after the feature parameter.

    Extraction always fails with a hint about quoting the URL in the shell.
    """
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the problem."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)