# coding: utf-8

import collections
import errno
import io
import itertools
import json
import os.path
import re
import string
import struct
import traceback
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_cachedir,
    get_element_by_id,
    get_element_by_attribute,
    ExtractorError,
    int_or_none,
    PagedList,
    RegexNotFoundError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                              # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                  # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                    # handle anchor (#/) redirect urls
                         (?:                                            # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                         # v/ or embed/ or e/
                             |(?:                                       # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                            # the params delimiter ? or # or #!
                                 (?:.*?&)?                              # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                    # just youtu.be/xxxx
                         )
                     )?                                                 # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
                     (?(1).+)?                                          # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
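    # Known itag -> format properties. Negative 'preference' values push the
    # 3D, HLS and DASH variants below the default muxed formats when sorting.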
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
    ]


    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url) is not None

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

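    # The signature function is extracted from the JS or SWF player and cached
    # on disk, keyed by player type, player id and signature length.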
    def _extract_signature_function(self, video_id, player_url, slen):
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                    return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res

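    # When the 'youtube_print_sig_code' option is set, dump the extracted
    # signature function as equivalent Python slicing code (handy for keeping
    # the static fallback algorithm up to date).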
    def _print_sig_code(self, func, slen):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)

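    # A minimal JavaScript interpreter covering just the subset used by the
    # player's signature function: assignments, indexing, split/join/slice/
    # reverse, the modulo operator and calls to helper functions.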
    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        functions = {}

        def argidx(varname):
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion-1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                return res
            return resf

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])

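    # For the Flash player the signature routine lives in ActionScript: the
    # SWF is decompressed, its ABC (AVM2 bytecode) block is parsed, and the
    # 'decipher' method is interpreted with a small stack machine below.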
    def _parse_sig_swf(self, file_contents):
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])

    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
            try:
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    )
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                return func(s)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

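    # Static fallback: a table of hard-coded character permutations, selected
    # by signature length, used when automatic extraction from the player fails.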
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        if age_gate:
            # The videos with age protection use another player, so the
            # algorithms can be different.
            if len(s) == 86:
                return s[2:63] + s[82] + s[64:82] + s[63]

        if len(s) == 93:
            return s[86:29:-1] + s[88] + s[28:5:-1]
        elif len(s) == 92:
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
        elif len(s) == 91:
            return s[84:27:-1] + s[86] + s[26:5:-1]
        elif len(s) == 90:
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
        elif len(s) == 89:
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
        elif len(s) == 88:
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
        elif len(s) == 87:
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
        elif len(s) == 86:
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
        elif len(s) == 85:
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
        elif len(s) == 84:
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
        elif len(s) == 83:
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
        elif len(s) == 82:
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
        elif len(s) == 81:
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
        elif len(s) == 80:
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
        elif len(s) == 79:
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        else:
            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))

    def _get_available_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
            return {}
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(l[0]).encode('utf-8'),
            })
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
            return {}
        return sub_lang_list

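    # Automatic (ASR) captions are listed via the 'ttsurl' from the player
    # config; each target language is produced by translating the original
    # ASR track with the 'tlang' parameter.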
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

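    # For live/HLS videos the formats come from an m3u8 master playlist; the
    # itag is recovered from each variant URL ('itag/<n>/').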
    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}
        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map

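    # Annotations are returned as raw XML from the annotations_invideo
    # endpoint and stored verbatim in the info dict.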
    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts': '1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            note=False,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # these signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                           (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

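        # DASH formats are only fetched when 'youtube_include_dash_manifest'
        # is enabled; entries whose format_id already exists are updated in
        # place instead of being duplicated.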
        # Look for the DASH manifest
        dash_manifest_url_lst = video_info.get('dashmpd')
        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
                self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                dash_doc = self._download_xml(
                    dash_manifest_url_lst[0], video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }

880e1c52 1422class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1423 IE_DESC = u'YouTube.com playlists'
c5e8d7af
PH
1424 _VALID_URL = r"""(?:
1425 (?:https?://)?
1426 (?:\w+\.)?
1427 youtube\.com/
1428 (?:
1429 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1430 \? (?:.*?&)*? (?:p|a|list)=
1431 | p/
1432 )
715c8e7b 1433 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
c5e8d7af
PH
1434 .*
1435 |
715c8e7b 1436 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1437 )"""
dcbb4580
JMF
1438 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1439 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1440 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1441 IE_NAME = u'youtube:playlist'
1442
1443 @classmethod
1444 def suitable(cls, url):
1445 """Receives a URL and returns True if suitable for this IE."""
1446 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1447
880e1c52
JMF
1448 def _real_initialize(self):
1449 self._login()
1450
652cdaa2
JMF
1451 def _ids_to_results(self, ids):
1452 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1453 for vid_id in ids]
1454
1455 def _extract_mix(self, playlist_id):
1456 # The mixes are generated from a a single video
1457 # the id of the playlist is just 'RD' + video_id
7d4afc55 1458 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1459 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1460 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1461 get_element_by_attribute('class', 'title ', webpage))
1462 title = clean_html(title_span)
652cdaa2
JMF
1463 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1464 ids = orderedSet(re.findall(video_re, webpage))
1465 url_results = self._ids_to_results(ids)
1466
1467 return self.playlist_result(url_results, playlist_id, title)
1468
c5e8d7af
PH
1469 def _real_extract(self, url):
1470 # Extract playlist id
1471 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1472 if mobj is None:
1473 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1474 playlist_id = mobj.group(1) or mobj.group(2)
1475
1476 # Check if it's a video-specific URL
7c61bd36 1477 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1478 if 'v' in query_dict:
1479 video_id = query_dict['v'][0]
1480 if self._downloader.params.get('noplaylist'):
1481 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1482 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1483 else:
1484 self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1485
7d4afc55 1486 if playlist_id.startswith('RD'):
652cdaa2
JMF
1487 # Mixes require a custom extraction process
1488 return self._extract_mix(playlist_id)
0a688bc0
JMF
1489 if playlist_id.startswith('TL'):
1490 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1491 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1492
dcbb4580
JMF
1493 # Extract the video ids from the playlist pages
1494 ids = []
c5e8d7af 1495
755eb032 1496 for page_num in itertools.count(1):
dcbb4580 1497 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1498 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1499 matches = re.finditer(self._VIDEO_RE, page)
1500 # We remove the duplicates and the link with index 0
1501 # (it's not the first video of the playlist)
1502 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1503 ids.extend(new_ids)
c5e8d7af 1504
dcbb4580 1505 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1506 break
1507
c91778f8
PH
1508 try:
1509 playlist_title = self._og_search_title(page)
1510 except RegexNotFoundError:
1511 self.report_warning(
1512 u'Playlist page is missing OpenGraph title, falling back ...',
1513 playlist_id)
1514 playlist_title = self._html_search_regex(
1515 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1516
652cdaa2 1517 url_results = self._ids_to_results(ids)
dcbb4580 1518 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1519
1520
0a688bc0
JMF
1521class YoutubeTopListIE(YoutubePlaylistIE):
1522 IE_NAME = u'youtube:toplist'
1523 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1524 u' (Example: "yttoplist:music:Top Tracks")')
1525 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1526
1527 def _real_extract(self, url):
1528 mobj = re.match(self._VALID_URL, url)
1529 channel = mobj.group('chann')
1530 title = mobj.group('title')
1531 query = compat_urllib_parse.urlencode({'title': title})
1532 playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
1533 channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
1534 link = self._html_search_regex(playlist_re, channel_page, u'list')
1535 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
1536
1537 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1538 ids = []
1539 # sometimes the webpage doesn't contain the videos
1540 # retry until we get them
1541 for i in itertools.count(0):
1542 msg = u'Downloading Youtube top list'
1543 if i > 0:
1544 msg += ', retry #%d' % i
1545 webpage = self._download_webpage(url, title, msg)
1546 ids = orderedSet(re.findall(video_re, webpage))
1547 if ids:
1548 break
1549 url_results = self._ids_to_results(ids)
1550 return self.playlist_result(url_results, playlist_title=title)
1551
1552
c5e8d7af 1553class YoutubeChannelIE(InfoExtractor):
0f818663 1554 IE_DESC = u'YouTube.com channels'
c5e8d7af 1555 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1556 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
252580c5 1557 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1558 IE_NAME = u'youtube:channel'
1559
1560 def extract_videos_from_page(self, page):
1561 ids_in_page = []
1562 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1563 if mobj.group(1) not in ids_in_page:
1564 ids_in_page.append(mobj.group(1))
1565 return ids_in_page
1566
1567 def _real_extract(self, url):
1568 # Extract channel id
1569 mobj = re.match(self._VALID_URL, url)
1570 if mobj is None:
1571 raise ExtractorError(u'Invalid URL: %s' % url)
1572
1573 # Download channel page
1574 channel_id = mobj.group(1)
1575 video_ids = []
b9643eed
JMF
1576 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1577 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1578 autogenerated = re.search(r'''(?x)
1579 class="[^"]*?(?:
1580 channel-header-autogenerated-label|
1581 yt-channel-title-autogenerated
1582 )[^"]*"''', channel_page) is not None
c5e8d7af 1583
b9643eed
JMF
1584 if autogenerated:
1585 # The videos are contained in a single page
1586 # the ajax pages can't be used, they are empty
1587 video_ids = self.extract_videos_from_page(channel_page)
1588 else:
1589 # Download all channel pages using the json-based channel_ajax query
1590 for pagenum in itertools.count(1):
1591 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1592 page = self._download_webpage(url, channel_id,
1593 u'Downloading page #%s' % pagenum)
1594
1595 page = json.loads(page)
1596
1597 ids_in_page = self.extract_videos_from_page(page['content_html'])
1598 video_ids.extend(ids_in_page)
1599
1600 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1601 break
c5e8d7af
PH
1602
1603 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1604
7012b23c
PH
1605 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1606 for video_id in video_ids]
1607 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1608
1609
1610class YoutubeUserIE(InfoExtractor):
0f818663 1611 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1612 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
c5e8d7af
PH
1613 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1614 _GDATA_PAGE_SIZE = 50
fd9cf738 1615 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1616 IE_NAME = u'youtube:user'
1617
e3ea4790 1618 @classmethod
f4b05232 1619 def suitable(cls, url):
e3ea4790
JMF
1620 # Don't return True if the url can be extracted with another youtube
1621 # extractor; the regex is too permissive and would match it as well.
1622 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1623 if any(ie.suitable(url) for ie in other_ies):
    return False
f4b05232
JMF
1624 else:
    return super(YoutubeUserIE, cls).suitable(url)
1625
c5e8d7af
PH
1626 def _real_extract(self, url):
1627 # Extract username
1628 mobj = re.match(self._VALID_URL, url)
1629 if mobj is None:
1630 raise ExtractorError(u'Invalid URL: %s' % url)
1631
1632 username = mobj.group(1)
1633
1634 # Download video ids using YouTube Data API. Result size per
1635 # query is limited (currently to 50 videos) so we need to query
1636 # page by page until no more video ids are returned, which means we
1637 # have all of them.
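# For instance, with the page size of 50 the second page (pagenum == 1)
# is requested from a URL of the form
# .../feeds/api/users/<username>/uploads?max-results=50&start-index=51&alt=json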
1638
b7ab0590 1639 def download_page(pagenum):
c5e8d7af
PH
1640 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1641
1642 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
b7ab0590
PH
1643 page = self._download_webpage(
1644 gdata_url, username,
1645 u'Downloading video ids from %d to %d' % (
1646 start_index, start_index + self._GDATA_PAGE_SIZE))
c5e8d7af 1647
fd9cf738
JMF
1648 try:
1649 response = json.loads(page)
1650 except ValueError as err:
1651 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637 1652 if 'entry' not in response['feed']:
b7ab0590 1653 return
fd9cf738 1654
c5e8d7af 1655 # Extract video identifiers
e302f9ce
PH
1656 entries = response['feed']['entry']
1657 for entry in entries:
1658 title = entry['title']['$t']
1659 video_id = entry['id']['$t'].split('/')[-1]
b7ab0590 1660 yield {
e302f9ce
PH
1661 '_type': 'url',
1662 'url': video_id,
1663 'ie_key': 'Youtube',
1664 'id': video_id,
1665 'title': title,
b7ab0590
PH
1666 }
1667 url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
c5e8d7af 1668
7012b23c
PH
1669 return self.playlist_result(url_results, playlist_title=username)
1670
b05654f0
PH
1671
1672class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1673 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1674 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1675 _MAX_RESULTS = 1000
1676 IE_NAME = u'youtube:search'
1677 _SEARCH_KEY = 'ytsearch'
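# Usage sketch (the query parsing itself lives in SearchInfoExtractor):
# "ytsearch:<query>" yields the first result, "ytsearchN:<query>"
# (e.g. "ytsearch5:<query>") the first N results, capped at _MAX_RESULTS.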
1678
b05654f0
PH
1679 def _get_n_results(self, query, n):
1680 """Get a specified number of results for a query"""
1681
1682 video_ids = []
1683 pagenum = 0
1684 limit = n
1685
1686 while (50 * pagenum) < limit:
b05654f0 1687 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
7cc3570e
PH
1688 data_json = self._download_webpage(
1689 result_url, video_id=u'query "%s"' % query,
1690 note=u'Downloading page %s' % (pagenum + 1),
1691 errnote=u'Unable to download API page')
1692 data = json.loads(data_json)
1693 api_response = data['data']
1694
1695 if 'items' not in api_response:
b05654f0
PH
1696 raise ExtractorError(u'[youtube] No video results')
1697
1698 new_ids = list(video['id'] for video in api_response['items'])
1699 video_ids += new_ids
1700
1701 limit = min(n, api_response['totalItems'])
1702 pagenum += 1
1703
1704 if len(video_ids) > n:
1705 video_ids = video_ids[:n]
7012b23c
PH
1706 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1707 for video_id in video_ids]
b05654f0 1708 return self.playlist_result(videos, query)
75dff0ee 1709
a3dd9248 1710class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 1711 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248
CM
1712 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1713 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1714 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1715
1716class YoutubeShowIE(InfoExtractor):
0f818663 1717 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1718 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1719 IE_NAME = u'youtube:show'
1720
1721 def _real_extract(self, url):
1722 mobj = re.match(self._VALID_URL, url)
1723 show_name = mobj.group(1)
1724 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1725 # There's one playlist for each season of the show
1726 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1727 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1728 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1729
1730
b2e8bc1b 1731class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1732 """
1733 Base class for extractors that fetch info from
1734 http://www.youtube.com/feed_ajax
1735 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1736 """
b2e8bc1b 1737 _LOGIN_REQUIRED = True
43ba5456
JMF
1738 # If True, use action_load_personal_feed instead of action_load_system_feed
1739 _PERSONAL_FEED = False
04cc9617 1740
d7ae0639
JMF
1741 @property
1742 def _FEED_TEMPLATE(self):
43ba5456
JMF
1743 action = 'action_load_system_feed'
1744 if self._PERSONAL_FEED:
1745 action = 'action_load_personal_feed'
1746 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
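# e.g. for the subscriptions feed (a system feed) this resolves to
# http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s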
d7ae0639
JMF
1747
1748 @property
1749 def IE_NAME(self):
1750 return u'youtube:%s' % self._FEED_NAME
04cc9617 1751
81f0259b 1752 def _real_initialize(self):
b2e8bc1b 1753 self._login()
81f0259b 1754
04cc9617
JMF
1755 def _real_extract(self, url):
1756 feed_entries = []
0e44d838
JMF
1757 paging = 0
1758 for i in itertools.count(1):
d7ae0639
JMF
1759 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1760 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1761 u'Downloading page %s' % i)
1762 info = json.loads(info)
1763 feed_html = info['feed_html']
43ba5456 1764 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617 1765 ids = orderedSet(m.group(1) for m in m_ids)
7012b23c
PH
1766 feed_entries.extend(
1767 self.url_result(video_id, 'Youtube', video_id=video_id)
1768 for video_id in ids)
04cc9617
JMF
1769 if info['paging'] is None:
1770 break
0e44d838 1771 paging = info['paging']
d7ae0639
JMF
1772 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1773
1774class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1775 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1776 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1777 _FEED_NAME = 'subscriptions'
1778 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1779
1780class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1781 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1782 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1783 _FEED_NAME = 'recommended'
1784 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1785
43ba5456
JMF
1786class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1787 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1788 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1789 _FEED_NAME = 'watch_later'
1790 _PLAYLIST_TITLE = u'Youtube Watch Later'
43ba5456 1791 _PERSONAL_FEED = True
c626a3d9 1792
f459d170
JMF
1793class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1794 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1795 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1796 _FEED_NAME = 'history'
1797 _PERSONAL_FEED = True
1798 _PLAYLIST_TITLE = u'Youtube Watch History'
1799
c626a3d9
JMF
1800class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1801 IE_NAME = u'youtube:favorites'
1802 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1803 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1804 _LOGIN_REQUIRED = True
1805
1806 def _real_extract(self, url):
1807 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1808 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1809 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1810
1811
1812class YoutubeTruncatedURLIE(InfoExtractor):
1813 IE_NAME = 'youtube:truncated_url'
1814 IE_DESC = False # Do not list
975d35db
PH
1815 _VALID_URL = r'''(?x)
1816 (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
1817 (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
1818 '''
15870e90
PH
1819
1820 def _real_extract(self, url):
1821 raise ExtractorError(
1822 u'Did you forget to quote the URL? Remember that & is a meta '
1823 u'character in most shells, so you want to put the URL in quotes, '
1824 u'like youtube-dl '
b4622a32
PH
1825 u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
1826 u'or simply youtube-dl BaW_jenozKc .',
15870e90 1827 expected=True)