# coding: utf-8

import collections
import errno
import io
import itertools
import json
import os.path
import re
import string
import struct
import traceback
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_cachedir,
    get_element_by_id,
    get_element_by_attribute,
    ExtractorError,
    int_or_none,
    PagedList,
    RegexNotFoundError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()

class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},

        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

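    # Illustrative sketch (not part of the extractor): _map_to_format_list() in
    # _real_extract() below merges these static itag attributes into the URL map
    # returned by YouTube. Roughly, assuming an itag/url pair from get_video_info:
    #
    #     dct = {'format_id': '22', 'url': video_real_url, 'player_url': player_url}
    #     dct.update(YoutubeIE._formats['22'])
    #     # -> {'format_id': '22', 'ext': 'mp4', 'width': 1280, 'height': 720, ...}
    #
    # Unknown itags simply keep the bare 'format_id'/'url'/'player_url' entries.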
    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url) is not None

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _extract_signature_function(self, video_id, player_url, slen):
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res

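    # A short sketch of the cache format used above: a deciphered signature is a
    # permutation of its input, so the cached JSON ("cache_spec") is simply the
    # list of source indices that produce the output. For example, if the player
    # turned u'abc' into u'cab', the spec would be [2, 0, 1] and the cached lambda
    # reproduces it without re-downloading or re-parsing the player:
    #
    #     spec = [2, 0, 1]                              # hypothetical cache contents
    #     decrypt = lambda s: u''.join(s[i] for i in spec)
    #     decrypt(u'abc')                               # -> u'cab'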
    def _print_sig_code(self, func, slen):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end + step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)

    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        functions = {}

        def argidx(varname):
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion - 1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion - 1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                return res
            return resf

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])

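    # Example of the kind of player code _parse_sig_js() is written for (the
    # function name and body below are made up for illustration, not taken from
    # a real player):
    #
    #     function zx(a){a=a.split("");a=a.reverse();a=a.slice(2);return a.join("")}
    #
    # The mini-interpreter above evaluates those statements on a Python list, so
    # for this example the returned lambda maps u'0123456789' to u'76543210'.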
    def _parse_sig_swf(self, file_contents):
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res
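
        # A note on the encoding handled by read_int()/u30() above: ABC uses
        # variable-length integers, seven payload bits per byte with the high
        # bit as a continuation flag, least-significant group first. For
        # example, the single byte 0x05 decodes to 5, while the pair 0x96 0x01
        # decodes to 0x16 | (0x01 << 7) = 150. This is only an illustration of
        # the wire format, not additional extractor behaviour.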

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count - 1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])

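    # Rough sketch of what resfunc() above does at runtime (assumed bytecode for
    # illustration, not dumped from a real player): for a method whose body is
    # essentially "return arg1.split('').reverse().join('')", the stack machine
    # would execute getlocal_1, pushstring '', callproperty split, callpropvoid
    # reverse, pushstring '', callproperty join, returnvalue - mirroring the JS
    # case above, but driven by AVM2 opcodes instead of parsed source text.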
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
            try:
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    )
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                return func(s)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        if age_gate:
            # The videos with age protection use another player, so the
            # algorithms can be different.
            if len(s) == 86:
                return s[2:63] + s[82] + s[64:82] + s[63]

        if len(s) == 93:
            return s[86:29:-1] + s[88] + s[28:5:-1]
        elif len(s) == 92:
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
        elif len(s) == 91:
            return s[84:27:-1] + s[86] + s[26:5:-1]
        elif len(s) == 90:
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
        elif len(s) == 89:
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
        elif len(s) == 88:
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
        elif len(s) == 87:
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
        elif len(s) == 86:
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
        elif len(s) == 85:
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
        elif len(s) == 84:
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
        elif len(s) == 83:
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
        elif len(s) == 82:
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
        elif len(s) == 81:
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
        elif len(s) == 80:
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
        elif len(s) == 79:
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        else:
            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))

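    # Worked example for one of the static recipes above, using a toy input
    # rather than a real signature: for len(s) == 86 with age_gate, the rule
    # s[2:63] + s[82] + s[64:82] + s[63] keeps characters 2-62, then appends
    # character 82, characters 64-81 and finally character 63, dropping the
    # rest. Each branch is just such a fixed reordering for one signature
    # length:
    #
    #     s = u''.join(compat_chr(48 + i % 10) for i in range(86))  # toy input
    #     len(s[2:63] + s[82] + s[64:82] + s[63])                   # -> 81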
    def _get_available_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
            return {}
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(l[0]).encode('utf-8'),
            })
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
            return {}
        return sub_lang_list

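    # The list endpoint queried above returns XML roughly along the lines of
    # <transcript_list><track name="" lang_code="en" .../></transcript_list>,
    # which the regex reduces to (name, lang_code) pairs; the resulting dict is
    # e.g. {'en': u'http://www.youtube.com/api/timedtext?...&lang=en&v=...'}.
    # (Shape shown for illustration only; the exact attributes vary.)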
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}
        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map

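    # Sketch of the input _extract_from_m3u8() expects (abbreviated and
    # illustrative only): an HLS master playlist whose variant URLs carry the
    # itag in the path, e.g.
    #
    #     #EXTM3U
    #     #EXT-X-STREAM-INF:BANDWIDTH=1300000,RESOLUTION=640x360
    #     https://manifest.googlevideo.com/api/manifest/hls_playlist/itag/93/...
    #
    # which would yield url_map == {'93': 'https://manifest.googlevideo.com/...'}.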
    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts': '1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            note=False,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map:
            # these signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                           (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        dash_manifest_url_lst = video_info.get('dashmpd')
        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
                self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                dash_doc = self._download_xml(
                    dash_manifest_url_lst[0], video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

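        # For reference, the manifest elements consumed above look roughly like
        # (abbreviated, namespaces and most attributes omitted):
        #
        #     <Representation id="137" bandwidth="4400000" width="1920" height="1080">
        #         <BaseURL>http://.../videoplayback?...&amp;itag=137&amp;...</BaseURL>
        #     </Representation>
        #
        # so each Representation either enriches an existing format with the same
        # format_id or is appended as a new DASH-only format.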
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }

class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
                      get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

c5e8d7af
PH
1468 def _real_extract(self, url):
1469 # Extract playlist id
1470 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1471 if mobj is None:
1472 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1473 playlist_id = mobj.group(1) or mobj.group(2)
1474
1475 # Check if it's a video-specific URL
7c61bd36 1476 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1477 if 'v' in query_dict:
1478 video_id = query_dict['v'][0]
1479 if self._downloader.params.get('noplaylist'):
1480 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1481 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1482 else:
1483 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1484
7d4afc55 1485 if playlist_id.startswith('RD'):
652cdaa2
JMF
1486 # Mixes require a custom extraction process
1487 return self._extract_mix(playlist_id)
0a688bc0
JMF
1488 if playlist_id.startswith('TL'):
1489 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1490 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1491
dcbb4580
JMF
1492 # Extract the video ids from the playlist pages
1493 ids = []
c5e8d7af 1494
755eb032 1495 for page_num in itertools.count(1):
dcbb4580 1496 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1497 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1498 matches = re.finditer(self._VIDEO_RE, page)
1499 # We remove the duplicates and the link with index 0
1500 # (it's not the first video of the playlist)
1501 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1502 ids.extend(new_ids)
c5e8d7af 1503
dcbb4580 1504 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1505 break
1506
c91778f8
PH
1507 try:
1508 playlist_title = self._og_search_title(page)
1509 except RegexNotFoundError:
1510 self.report_warning(
1511 u'Playlist page is missing OpenGraph title, falling back ...',
1512 playlist_id)
1513 playlist_title = self._html_search_regex(
1514 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1515
652cdaa2 1516 url_results = self._ids_to_results(ids)
dcbb4580 1517 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1518
1519
0a688bc0
JMF
1520class YoutubeTopListIE(YoutubePlaylistIE):
1521 IE_NAME = u'youtube:toplist'
1522 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1523 u' (Example: "yttoplist:music:Top Tracks")')
1524 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1525
1526 def _real_extract(self, url):
1527 mobj = re.match(self._VALID_URL, url)
1528 channel = mobj.group('chann')
1529 title = mobj.group('title')
1530 query = compat_urllib_parse.urlencode({'title': title})
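        # The channel page links to the list with the url-encoded title in the
        # link's query string; find that link and follow it to the list page.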
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # Sometimes the webpage doesn't contain the videos;
        # retry until we get them.
        for i in itertools.count(0):
            msg = u'Downloading Youtube top list'
            if i > 0:
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)


class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page;
            # the ajax pages can't be used, they are empty.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)


class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the URL can be extracted by another youtube
        # extractor: this regex is too permissive and would match those URLs too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using the YouTube Data API. The result size per
        # query is limited (currently to 50 videos), so we query page by page
        # until no more video ids are returned - that means we got all of them.

        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
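            # Illustration: for pagenum 0 this yields start-index=1, i.e. a request like
            # http://gdata.youtube.com/feeds/api/users/<username>/uploads?max-results=50&start-index=1&alt=json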

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
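        # PagedList evaluates download_page lazily, so only the GData pages whose
        # entries are actually requested (e.g. via --playlist-start/--playlist-end)
        # get downloaded.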
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)


class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
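            # The API returns at most 50 items per request and start-index is
            # 1-based, so page 0 requests start-index=1, page 1 requests 51, etc.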
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)


class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'


class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in m_seasons]


class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
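        # For the subscriptions feed (a system feed) this expands to:
        # http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s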
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)


class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'


class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'


class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True


class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'


class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
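        # Hand the favourites playlist id over to YoutubePlaylistIE.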
        return self.url_result(playlist_id, 'YoutubePlaylist')


class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .',
            expected=True)