]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
release 2014.03.03
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
04cc9617 34 orderedSet,
edf3e38e 35 write_json_file,
81c2f20b 36 uppercase_escape,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and return True if a page came back.

        The download is non-fatal; the result is reduced to a boolean so
        callers can simply test it.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Try to log in with the configured credentials.

        Returns True when the login form was submitted and the response does
        not contain the login form again, False in every other case (no
        credentials, a page could not be downloaded, or the credentials were
        rejected).

        Raises ExtractorError when no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) so that every failure path
            # of this method yields the same value.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were refused
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that does not succeed. Does nothing without a downloader."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 130
8377574c 131
de7f3446 132class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 133 IE_DESC = u'YouTube.com'
cb7dfeea 134 _VALID_URL = r"""(?x)^
c5e8d7af 135 (
83aa5293 136 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 137 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 138 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 139 (?:www\.)?pwnyoutube\.com/|
f7000f3a 140 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
141 tube\.majestyc\.net/|
142 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
143 (?:.*?\#/)? # handle anchor (#/) redirect urls
144 (?: # the various things that can precede the ID:
145 (?:(?:v|embed|e)/) # v/ or embed/ or e/
146 |(?: # or the v= param in all its forms
f7000f3a 147 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
148 (?:\?|\#!?) # the params delimiter ? or # or #!
149 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
150 v=
151 )
f4b05232
JMF
152 ))
153 |youtu\.be/ # just youtu.be/xxxx
154 )
c5e8d7af 155 )? # all until now is optional -> you can pass the naked ID
8963d9c2 156 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
157 (?(1).+)? # if we found the ID, everything can follow
158 $"""
c5e8d7af 159 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
160 _formats = {
161 '5': {'ext': 'flv', 'width': 400, 'height': 240},
162 '6': {'ext': 'flv', 'width': 450, 'height': 270},
163 '13': {'ext': '3gp'},
164 '17': {'ext': '3gp', 'width': 176, 'height': 144},
165 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
166 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
167 '34': {'ext': 'flv', 'width': 640, 'height': 360},
168 '35': {'ext': 'flv', 'width': 854, 'height': 480},
169 '36': {'ext': '3gp', 'width': 320, 'height': 240},
170 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
171 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
172 '43': {'ext': 'webm', 'width': 640, 'height': 360},
173 '44': {'ext': 'webm', 'width': 854, 'height': 480},
174 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
175 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
176
1d043b93 177
86fe61c8 178 # 3d videos
2c62dc26
PH
179 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
180 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
181 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
182 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
183 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
184 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
185 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 186
96fb5605 187 # Apple HTTP Live Streaming
2c62dc26
PH
188 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
189 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
190 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
191 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
192 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
193 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
194 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
195
196 # DASH mp4 video
197 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
198 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
199 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
200 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
201 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
60d3a2e0 202 '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'preference': -40},
2c62dc26 203 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
60d3a2e0 204 '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'preference': -40},
836a086c 205
f6f1fc92 206 # Dash mp4 audio
2c62dc26
PH
207 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
208 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
209 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
210
211 # Dash webm
1394ce65
PH
212 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
213 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
2c5bae42
PH
214 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
215 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
1394ce65
PH
216 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
217 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
218 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
219 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
220 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
221 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
222 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
223 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
224 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
225
226 # Dash webm audio
227 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
228 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
229
230 # RTMP (unnamed)
231 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 232 }
836a086c 233
c5e8d7af 234 IE_NAME = u'youtube'
2eb88d95
PH
235 _TESTS = [
236 {
0e853ca4
PH
237 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
238 u"file": u"BaW_jenozKc.mp4",
239 u"info_dict": {
240 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
241 u"uploader": u"Philipp Hagemeister",
242 u"uploader_id": u"phihag",
243 u"upload_date": u"20121002",
27dcce19 244 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 245 }
0e853ca4 246 },
0e853ca4
PH
247 {
248 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
249 u"file": u"UxxajLWwzqY.mp4",
250 u"note": u"Test generic use_cipher_signature video (#897)",
251 u"info_dict": {
252 u"upload_date": u"20120506",
253 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 254 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 255 u"uploader": u"Icona Pop",
0e853ca4 256 u"uploader_id": u"IconaPop"
2eb88d95 257 }
c108eb73
JMF
258 },
259 {
260 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
261 u"file": u"07FYdnEawAQ.mp4",
262 u"note": u"Test VEVO video with age protection (#956)",
263 u"info_dict": {
264 u"upload_date": u"20130703",
265 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
266 u"description": u"md5:64249768eec3bc4276236606ea996373",
267 u"uploader": u"justintimberlakeVEVO",
268 u"uploader_id": u"justintimberlakeVEVO"
269 }
270 },
fccd3771 271 {
83aa5293 272 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
273 u"file": u"yZIXLfi8CZQ.mp4",
274 u"note": u"Embed-only video (#1746)",
275 u"info_dict": {
276 u"upload_date": u"20120608",
277 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
278 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
279 u"uploader": u"SET India",
280 u"uploader_id": u"setindia"
281 }
282 },
dd27fd17
PH
283 {
284 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
285 u"file": u"a9LDPn-MO4I.m4a",
286 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
287 u"info_dict": {
288 u"upload_date": "20121002",
289 u"uploader_id": "8KVIDEO",
290 u"description": "No description available.",
291 u"uploader": "8KVIDEO",
292 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
293 },
294 u"params": {
295 u"youtube_include_dash_manifest": True,
296 u"format": "141",
297 },
dd27fd17 298 },
3489b7d2
JMF
299 # DASH manifest with encrypted signature
300 {
301 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
302 u'info_dict': {
303 u'id': u'IB3lcPjvWLA',
304 u'ext': u'm4a',
305 u'title': u'Afrojack - The Spark ft. Spree Wilson',
306 u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
307 u'uploader': u'AfrojackVEVO',
308 u'uploader_id': u'AfrojackVEVO',
309 u'upload_date': u'20131011',
310 },
311 u"params": {
312 u'youtube_include_dash_manifest': True,
313 u'format': '141',
314 },
315 },
2eb88d95
PH
316 ]
317
c5e8d7af
PH
318
@classmethod
def suitable(cls, url):
    """Return True if this extractor should handle ``url``.

    URLs accepted by YoutubePlaylistIE are always declined here, even when
    they also match _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 324
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractors and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Starts out empty for every new extractor instance
    self._player_cache = dict()
e0df6211 328
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
332
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
336
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
340
def report_rtmp_download(self):
    """Tell the user that the download will go over RTMP."""
    message = u'RTMP download detected'
    self.to_screen(message)
344
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a callable that deciphers signatures of length ``slen``.

    The player id and type ('js' or 'swf') are taken from the end of
    ``player_url``. When a cache directory is configured, a previously
    stored index list for this player/length is used if it can be read;
    otherwise the player is downloaded, parsed, and the resulting function
    is stored as an index list on a best-effort basis.

    video_id is only passed on to the download helpers.

    Raises ExtractorError if the player URL has an unexpected shape or the
    player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        # Without this check the failure would surface as an
        # AttributeError on None
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id is used as a file name, so it must not contain a path
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # The cache file is not valid JSON (e.g. truncated); ignore it
            # and regenerate the function below
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An explicit raise: an assert would vanish under -O and leave
        # ``res`` unbound
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose characters are their own
            # indices; the code points of the output are the index list
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is optional: warn, but still return the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
401
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to ``func`` for inputs of length ``slen``.

    ``func`` is applied to a string whose characters are their own indices,
    so the code points of the result say which input position ends up at
    which output position. That index list is rendered as a sum of
    ``s[...]`` items and slices and shown via to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to emit for an empty index list
            return

        step = None
        start = '(Never used)'  # Quell pyflakes warnings - start will be
                                # set as soon as step is set
        # With a single index the loop below never runs; pre-binding ``i``
        # keeps the final yield from hitting an unbound variable
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run of consecutive indices ended at ``prev``
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # Start of an ascending or descending run
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 437
e0df6211
PH
438 def _parse_sig_js(self, jscode):
439 funcname = self._search_regex(
440 r'signature=([a-zA-Z]+)', jscode,
441 u'Initial JS player signature function name')
442
443 functions = {}
444
445 def argidx(varname):
446 return string.lowercase.index(varname)
447
448 def interpret_statement(stmt, local_vars, allow_recursion=20):
449 if allow_recursion < 0:
0ca96d48 450 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
451
452 if stmt.startswith(u'var '):
453 stmt = stmt[len(u'var '):]
454 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
455 r'=(?P<expr>.*)$', stmt)
456 if ass_m:
457 if ass_m.groupdict().get('index'):
458 def assign(val):
459 lvar = local_vars[ass_m.group('out')]
460 idx = interpret_expression(ass_m.group('index'),
461 local_vars, allow_recursion)
462 assert isinstance(idx, int)
463 lvar[idx] = val
464 return val
465 expr = ass_m.group('expr')
466 else:
467 def assign(val):
468 local_vars[ass_m.group('out')] = val
469 return val
470 expr = ass_m.group('expr')
471 elif stmt.startswith(u'return '):
472 assign = lambda v: v
473 expr = stmt[len(u'return '):]
474 else:
475 raise ExtractorError(
476 u'Cannot determine left side of statement in %r' % stmt)
477
478 v = interpret_expression(expr, local_vars, allow_recursion)
479 return assign(v)
480
481 def interpret_expression(expr, local_vars, allow_recursion):
482 if expr.isdigit():
483 return int(expr)
484
485 if expr.isalpha():
486 return local_vars[expr]
487
488 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
489 if m:
490 member = m.group('member')
491 val = local_vars[m.group('in')]
492 if member == 'split("")':
493 return list(val)
494 if member == 'join("")':
495 return u''.join(val)
496 if member == 'length':
497 return len(val)
498 if member == 'reverse()':
499 return val[::-1]
500 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
501 if slice_m:
502 idx = interpret_expression(
503 slice_m.group('idx'), local_vars, allow_recursion-1)
504 return val[idx:]
505
506 m = re.match(
507 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
508 if m:
509 val = local_vars[m.group('in')]
510 idx = interpret_expression(m.group('idx'), local_vars,
511 allow_recursion-1)
512 return val[idx]
513
514 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
515 if m:
516 a = interpret_expression(m.group('a'),
517 local_vars, allow_recursion)
518 b = interpret_expression(m.group('b'),
519 local_vars, allow_recursion)
520 return a % b
521
522 m = re.match(
20650c86 523 r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
e0df6211
PH
524 if m:
525 fname = m.group('func')
526 if fname not in functions:
527 functions[fname] = extract_function(fname)
528 argvals = [int(v) if v.isdigit() else local_vars[v]
529 for v in m.group('args').split(',')]
530 return functions[fname](argvals)
531 raise ExtractorError(u'Unsupported JS expression %r' % expr)
532
533 def extract_function(funcname):
534 func_m = re.search(
535 r'function ' + re.escape(funcname) +
536 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
537 jscode)
538 argnames = func_m.group('args').split(',')
539
540 def resf(args):
541 local_vars = dict(zip(argnames, args))
542 for stmt in func_m.group('code').split(';'):
543 res = interpret_statement(stmt, local_vars)
544 return res
545 return resf
546
547 initial_function = extract_function(funcname)
548 return lambda s: initial_function([s])
549
550 def _parse_sig_swf(self, file_contents):
551 if file_contents[1:3] != b'WS':
552 raise ExtractorError(
553 u'Not an SWF file; header is %r' % file_contents[:3])
554 if file_contents[:1] == b'C':
555 content = zlib.decompress(file_contents[8:])
556 else:
557 raise NotImplementedError(u'Unsupported compression format %r' %
558 file_contents[:1])
559
560 def extract_tags(content):
561 pos = 0
562 while pos < len(content):
563 header16 = struct.unpack('<H', content[pos:pos+2])[0]
564 pos += 2
565 tag_code = header16 >> 6
566 tag_len = header16 & 0x3f
567 if tag_len == 0x3f:
568 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
569 pos += 4
570 assert pos+tag_len <= len(content)
571 yield (tag_code, content[pos:pos+tag_len])
572 pos += tag_len
573
574 code_tag = next(tag
575 for tag_code, tag in extract_tags(content)
576 if tag_code == 82)
577 p = code_tag.index(b'\0', 4) + 1
ba552f54 578 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
579
580 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
581 def read_int(reader=None):
582 if reader is None:
583 reader = code_reader
e0df6211
PH
584 res = 0
585 shift = 0
586 for _ in range(5):
ba552f54
PH
587 buf = reader.read(1)
588 assert len(buf) == 1
589 b = struct.unpack('<B', buf)[0]
e0df6211
PH
590 res = res | ((b & 0x7f) << shift)
591 if b & 0x80 == 0:
592 break
593 shift += 7
ba552f54
PH
594 return res
595
596 def u30(reader=None):
597 res = read_int(reader)
598 assert res & 0xf0000000 == 0
e0df6211
PH
599 return res
600 u32 = read_int
601
ba552f54
PH
602 def s32(reader=None):
603 v = read_int(reader)
e0df6211
PH
604 if v & 0x80000000 != 0:
605 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
606 return v
607
0ca96d48 608 def read_string(reader=None):
ba552f54
PH
609 if reader is None:
610 reader = code_reader
611 slen = u30(reader)
612 resb = reader.read(slen)
613 assert len(resb) == slen
614 return resb.decode('utf-8')
615
616 def read_bytes(count, reader=None):
617 if reader is None:
618 reader = code_reader
619 resb = reader.read(count)
620 assert len(resb) == count
621 return resb
622
623 def read_byte(reader=None):
624 resb = read_bytes(1, reader=reader)
625 res = struct.unpack('<B', resb)[0]
626 return res
e0df6211
PH
627
628 # minor_version + major_version
0ca96d48 629 read_bytes(2 + 2)
e0df6211
PH
630
631 # Constant pool
ba552f54 632 int_count = u30()
e0df6211 633 for _c in range(1, int_count):
0ca96d48 634 s32()
ba552f54 635 uint_count = u30()
e0df6211 636 for _c in range(1, uint_count):
0ca96d48 637 u32()
ba552f54 638 double_count = u30()
0ca96d48 639 read_bytes((double_count-1) * 8)
ba552f54 640 string_count = u30()
e0df6211
PH
641 constant_strings = [u'']
642 for _c in range(1, string_count):
0ca96d48 643 s = read_string()
e0df6211 644 constant_strings.append(s)
ba552f54 645 namespace_count = u30()
e0df6211 646 for _c in range(1, namespace_count):
0ca96d48
PH
647 read_bytes(1) # kind
648 u30() # name
ba552f54 649 ns_set_count = u30()
e0df6211 650 for _c in range(1, ns_set_count):
ba552f54 651 count = u30()
e0df6211 652 for _c2 in range(count):
0ca96d48 653 u30()
ba552f54 654 multiname_count = u30()
e0df6211
PH
655 MULTINAME_SIZES = {
656 0x07: 2, # QName
657 0x0d: 2, # QNameA
658 0x0f: 1, # RTQName
659 0x10: 1, # RTQNameA
660 0x11: 0, # RTQNameL
661 0x12: 0, # RTQNameLA
662 0x09: 2, # Multiname
663 0x0e: 2, # MultinameA
664 0x1b: 1, # MultinameL
665 0x1c: 1, # MultinameLA
666 }
667 multinames = [u'']
668 for _c in range(1, multiname_count):
ba552f54 669 kind = u30()
e0df6211
PH
670 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
671 if kind == 0x07:
0ca96d48 672 u30() # namespace_idx
ba552f54 673 name_idx = u30()
e0df6211
PH
674 multinames.append(constant_strings[name_idx])
675 else:
676 multinames.append('[MULTINAME kind: %d]' % kind)
677 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 678 u30()
e0df6211
PH
679
680 # Methods
ba552f54 681 method_count = u30()
e0df6211
PH
682 MethodInfo = collections.namedtuple(
683 'MethodInfo',
684 ['NEED_ARGUMENTS', 'NEED_REST'])
685 method_infos = []
686 for method_id in range(method_count):
ba552f54 687 param_count = u30()
0ca96d48 688 u30() # return type
e0df6211 689 for _ in range(param_count):
0ca96d48
PH
690 u30() # param type
691 u30() # name index (always 0 for youtube)
ba552f54 692 flags = read_byte()
e0df6211
PH
693 if flags & 0x08 != 0:
694 # Options present
ba552f54 695 option_count = u30()
e0df6211 696 for c in range(option_count):
0ca96d48
PH
697 u30() # val
698 read_bytes(1) # kind
e0df6211
PH
699 if flags & 0x80 != 0:
700 # Param names present
701 for _ in range(param_count):
0ca96d48 702 u30() # param name
e0df6211
PH
703 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
704 method_infos.append(mi)
705
706 # Metadata
ba552f54 707 metadata_count = u30()
e0df6211 708 for _c in range(metadata_count):
0ca96d48 709 u30() # name
ba552f54 710 item_count = u30()
e0df6211 711 for _c2 in range(item_count):
0ca96d48
PH
712 u30() # key
713 u30() # value
ba552f54
PH
714
715 def parse_traits_info():
716 trait_name_idx = u30()
717 kind_full = read_byte()
e0df6211
PH
718 kind = kind_full & 0x0f
719 attrs = kind_full >> 4
720 methods = {}
721 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
722 u30() # Slot id
723 u30() # type_name_idx
ba552f54 724 vindex = u30()
e0df6211 725 if vindex != 0:
0ca96d48 726 read_byte() # vkind
e0df6211 727 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 728 u30() # disp_id
ba552f54 729 method_idx = u30()
e0df6211
PH
730 methods[multinames[trait_name_idx]] = method_idx
731 elif kind == 0x04: # Class
0ca96d48
PH
732 u30() # slot_id
733 u30() # classi
e0df6211 734 elif kind == 0x05: # Function
0ca96d48 735 u30() # slot_id
ba552f54 736 function_idx = u30()
e0df6211
PH
737 methods[function_idx] = multinames[trait_name_idx]
738 else:
739 raise ExtractorError(u'Unsupported trait kind %d' % kind)
740
741 if attrs & 0x4 != 0: # Metadata present
ba552f54 742 metadata_count = u30()
e0df6211 743 for _c3 in range(metadata_count):
0ca96d48 744 u30() # metadata index
e0df6211 745
ba552f54 746 return methods
e0df6211
PH
747
748 # Classes
749 TARGET_CLASSNAME = u'SignatureDecipher'
750 searched_idx = multinames.index(TARGET_CLASSNAME)
751 searched_class_id = None
ba552f54 752 class_count = u30()
e0df6211 753 for class_id in range(class_count):
ba552f54 754 name_idx = u30()
e0df6211
PH
755 if name_idx == searched_idx:
756 # We found the class we're looking for!
757 searched_class_id = class_id
0ca96d48 758 u30() # super_name idx
ba552f54 759 flags = read_byte()
e0df6211 760 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 761 u30() # protected_ns_idx
ba552f54 762 intrf_count = u30()
e0df6211 763 for _c2 in range(intrf_count):
0ca96d48
PH
764 u30()
765 u30() # iinit
ba552f54 766 trait_count = u30()
e0df6211 767 for _c2 in range(trait_count):
0ca96d48 768 parse_traits_info()
e0df6211
PH
769
770 if searched_class_id is None:
771 raise ExtractorError(u'Target class %r not found' %
772 TARGET_CLASSNAME)
773
774 method_names = {}
775 method_idxs = {}
776 for class_id in range(class_count):
0ca96d48 777 u30() # cinit
ba552f54 778 trait_count = u30()
e0df6211 779 for _c2 in range(trait_count):
ba552f54 780 trait_methods = parse_traits_info()
e0df6211
PH
781 if class_id == searched_class_id:
782 method_names.update(trait_methods.items())
783 method_idxs.update(dict(
784 (idx, name)
785 for name, idx in trait_methods.items()))
786
787 # Scripts
ba552f54 788 script_count = u30()
e0df6211 789 for _c in range(script_count):
0ca96d48 790 u30() # init
ba552f54 791 trait_count = u30()
e0df6211 792 for _c2 in range(trait_count):
0ca96d48 793 parse_traits_info()
e0df6211
PH
794
795 # Method bodies
ba552f54 796 method_body_count = u30()
e0df6211
PH
797 Method = collections.namedtuple('Method', ['code', 'local_count'])
798 methods = {}
799 for _c in range(method_body_count):
ba552f54 800 method_idx = u30()
0ca96d48 801 u30() # max_stack
ba552f54 802 local_count = u30()
0ca96d48
PH
803 u30() # init_scope_depth
804 u30() # max_scope_depth
ba552f54
PH
805 code_length = u30()
806 code = read_bytes(code_length)
e0df6211 807 if method_idx in method_idxs:
ba552f54 808 m = Method(code, local_count)
e0df6211 809 methods[method_idxs[method_idx]] = m
ba552f54 810 exception_count = u30()
e0df6211 811 for _c2 in range(exception_count):
0ca96d48
PH
812 u30() # from
813 u30() # to
814 u30() # target
815 u30() # exc_type
816 u30() # var_name
ba552f54 817 trait_count = u30()
e0df6211 818 for _c2 in range(trait_count):
0ca96d48 819 parse_traits_info()
e0df6211 820
ba552f54 821 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
822 assert len(methods) == len(method_idxs)
823
824 method_pyfunctions = {}
825
826 def extract_function(func_name):
827 if func_name in method_pyfunctions:
828 return method_pyfunctions[func_name]
829 if func_name not in methods:
830 raise ExtractorError(u'Cannot find function %r' % func_name)
831 m = methods[func_name]
832
833 def resfunc(args):
e0df6211
PH
834 registers = ['(this)'] + list(args) + [None] * m.local_count
835 stack = []
836 coder = io.BytesIO(m.code)
837 while True:
838 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 839 if opcode == 36: # pushbyte
e0df6211
PH
840 v = struct.unpack('!B', coder.read(1))[0]
841 stack.append(v)
842 elif opcode == 44: # pushstring
843 idx = u30(coder)
844 stack.append(constant_strings[idx])
845 elif opcode == 48: # pushscope
846 # We don't implement the scope register, so we'll just
847 # ignore the popped value
848 stack.pop()
849 elif opcode == 70: # callproperty
850 index = u30(coder)
851 mname = multinames[index]
852 arg_count = u30(coder)
853 args = list(reversed(
854 [stack.pop() for _ in range(arg_count)]))
855 obj = stack.pop()
856 if mname == u'split':
857 assert len(args) == 1
858 assert isinstance(args[0], compat_str)
859 assert isinstance(obj, compat_str)
860 if args[0] == u'':
861 res = list(obj)
862 else:
863 res = obj.split(args[0])
864 stack.append(res)
a7177865
PH
865 elif mname == u'slice':
866 assert len(args) == 1
867 assert isinstance(args[0], int)
868 assert isinstance(obj, list)
869 res = obj[args[0]:]
870 stack.append(res)
871 elif mname == u'join':
872 assert len(args) == 1
873 assert isinstance(args[0], compat_str)
874 assert isinstance(obj, list)
875 res = args[0].join(obj)
876 stack.append(res)
e0df6211
PH
877 elif mname in method_pyfunctions:
878 stack.append(method_pyfunctions[mname](args))
879 else:
880 raise NotImplementedError(
881 u'Unsupported property %r on %r'
882 % (mname, obj))
a7177865
PH
883 elif opcode == 72: # returnvalue
884 res = stack.pop()
885 return res
886 elif opcode == 79: # callpropvoid
887 index = u30(coder)
888 mname = multinames[index]
889 arg_count = u30(coder)
890 args = list(reversed(
891 [stack.pop() for _ in range(arg_count)]))
892 obj = stack.pop()
893 if mname == u'reverse':
894 assert isinstance(obj, list)
895 obj.reverse()
896 else:
897 raise NotImplementedError(
898 u'Unsupported (void) property %r on %r'
899 % (mname, obj))
e0df6211
PH
900 elif opcode == 93: # findpropstrict
901 index = u30(coder)
902 mname = multinames[index]
903 res = extract_function(mname)
904 stack.append(res)
905 elif opcode == 97: # setproperty
906 index = u30(coder)
907 value = stack.pop()
908 idx = stack.pop()
909 obj = stack.pop()
910 assert isinstance(obj, list)
911 assert isinstance(idx, int)
912 obj[idx] = value
913 elif opcode == 98: # getlocal
914 index = u30(coder)
915 stack.append(registers[index])
916 elif opcode == 99: # setlocal
917 index = u30(coder)
918 value = stack.pop()
919 registers[index] = value
920 elif opcode == 102: # getproperty
921 index = u30(coder)
922 pname = multinames[index]
923 if pname == u'length':
924 obj = stack.pop()
925 assert isinstance(obj, list)
926 stack.append(len(obj))
927 else: # Assume attribute access
928 idx = stack.pop()
929 assert isinstance(idx, int)
930 obj = stack.pop()
931 assert isinstance(obj, list)
932 stack.append(obj[idx])
933 elif opcode == 128: # coerce
0ca96d48 934 u30(coder)
e0df6211
PH
935 elif opcode == 133: # coerce_s
936 assert isinstance(stack[-1], (type(None), compat_str))
937 elif opcode == 164: # modulo
938 value2 = stack.pop()
939 value1 = stack.pop()
940 res = value1 % value2
941 stack.append(res)
a7177865
PH
942 elif opcode == 208: # getlocal_0
943 stack.append(registers[0])
944 elif opcode == 209: # getlocal_1
945 stack.append(registers[1])
946 elif opcode == 210: # getlocal_2
947 stack.append(registers[2])
948 elif opcode == 211: # getlocal_3
949 stack.append(registers[3])
e0df6211
PH
950 elif opcode == 214: # setlocal_2
951 registers[2] = stack.pop()
952 elif opcode == 215: # setlocal_3
953 registers[3] = stack.pop()
954 else:
955 raise NotImplementedError(
956 u'Unsupported opcode %d' % opcode)
957
958 method_pyfunctions[func_name] = resfunc
959 return resfunc
960
961 initial_function = extract_function(u'decipher')
962 return lambda s: initial_function([s])
963
83799698 964 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 965 """Turn the encrypted s field into a working signature"""
6b37f0be 966
83799698 967 if player_url is not None:
9f9be844
PH
968 if player_url.startswith(u'//'):
969 player_url = u'https:' + player_url
e0df6211 970 try:
7f8ae73a
PH
971 player_id = (player_url, len(s))
972 if player_id not in self._player_cache:
83799698 973 func = self._extract_signature_function(
c4417ddb 974 video_id, player_url, len(s)
e0df6211 975 )
7f8ae73a
PH
976 self._player_cache[player_id] = func
977 func = self._player_cache[player_id]
edf3e38e
PH
978 if self._downloader.params.get('youtube_print_sig_code'):
979 self._print_sig_code(func, len(s))
980 return func(s)
0ca96d48 981 except Exception:
e0df6211 982 tb = traceback.format_exc()
83799698
PH
983 self._downloader.report_warning(
984 u'Automatic signature extraction failed: ' + tb)
e0df6211 985
d2d8f895
PH
986 self._downloader.report_warning(
987 u'Warning: Falling back to static signature algorithm')
920de7a2 988
2f2ffea9
PH
989 return self._static_decrypt_signature(
990 s, video_id, player_url, age_gate)
e0df6211 991
2f2ffea9 992 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
993 if age_gate:
994 # The videos with age protection use another player, so the
995 # algorithms can be different.
996 if len(s) == 86:
997 return s[2:63] + s[82] + s[64:82] + s[63]
998
bc4b9008 999 if len(s) == 93:
1000 return s[86:29:-1] + s[88] + s[28:5:-1]
1001 elif len(s) == 92:
444b1165 1002 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
1003 elif len(s) == 91:
1004 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
1005 elif len(s) == 90:
1006 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1007 elif len(s) == 89:
1008 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1009 elif len(s) == 88:
3e223834 1010 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1011 elif len(s) == 87:
3a725669 1012 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1013 elif len(s) == 86:
f2c327fd 1014 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 1015 elif len(s) == 85:
6ae8ee3f 1016 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1017 elif len(s) == 84:
6f56389b 1018 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1019 elif len(s) == 83:
920de7a2 1020 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1021 elif len(s) == 82:
c21315f2 1022 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1023 elif len(s) == 81:
aedd6bb9 1024 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1025 elif len(s) == 80:
1026 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1027 elif len(s) == 79:
1028 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1029
1030 else:
1031 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1032
1f343eaa 1033 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1034 try:
7fad1c63 1035 sub_list = self._download_webpage(
38c2e5b8 1036 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
1037 video_id, note=False)
1038 except ExtractorError as err:
de7f3446
JMF
1039 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1040 return {}
1041 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1042
1043 sub_lang_list = {}
1044 for l in lang_list:
1045 lang = l[1]
1046 params = compat_urllib_parse.urlencode({
1047 'lang': lang,
1048 'v': video_id,
ca715127 1049 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 1050 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 1051 })
38c2e5b8 1052 url = u'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
1053 sub_lang_list[lang] = url
1054 if not sub_lang_list:
1055 self._downloader.report_warning(u'video doesn\'t have subtitles')
1056 return {}
1057 return sub_lang_list
1058
055e6f36 1059 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1060 """We need the webpage for getting the captions url, pass it as an
1061 argument to speed up the process."""
ca715127 1062 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1063 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1064 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1065 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1066 if mobj is None:
1067 self._downloader.report_warning(err_msg)
1068 return {}
1069 player_config = json.loads(mobj.group(1))
1070 try:
1071 args = player_config[u'args']
1072 caption_url = args[u'ttsurl']
1073 timestamp = args[u'timestamp']
055e6f36
JMF
1074 # We get the available subtitles
1075 list_params = compat_urllib_parse.urlencode({
1076 'type': 'list',
1077 'tlangs': 1,
1078 'asrs': 1,
de7f3446 1079 })
055e6f36 1080 list_url = caption_url + '&' + list_params
e26f8712 1081 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1082 original_lang_node = caption_list.find('track')
f6a54188 1083 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1084 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1085 return {}
1086 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1087
1088 sub_lang_list = {}
1089 for lang_node in caption_list.findall('target'):
1090 sub_lang = lang_node.attrib['lang_code']
1091 params = compat_urllib_parse.urlencode({
1092 'lang': original_lang,
1093 'tlang': sub_lang,
1094 'fmt': sub_format,
1095 'ts': timestamp,
1096 'kind': 'asr',
1097 })
1098 sub_lang_list[sub_lang] = caption_url + '&' + params
1099 return sub_lang_list
de7f3446
JMF
1100 # An extractor error can be raise by the download process if there are
1101 # no automatic captions but there are subtitles
1102 except (KeyError, ExtractorError):
1103 self._downloader.report_warning(err_msg)
1104 return {}
1105
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id (second group of cls._VALID_URL) found in url.

    Raises ExtractorError if url does not match cls._VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1113
1d043b93
JMF
1114 def _extract_from_m3u8(self, manifest_url, video_id):
1115 url_map = {}
1116 def _get_urls(_manifest):
1117 lines = _manifest.split('\n')
1118 urls = filter(lambda l: l and not l.startswith('#'),
1119 lines)
1120 return urls
1121 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1122 formats_urls = _get_urls(manifest)
1123 for format_url in formats_urls:
890f62e8 1124 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1125 url_map[itag] = format_url
1126 return url_map
1127
1fb07d10
JG
1128 def _extract_annotations(self, video_id):
1129 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1130 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1131
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and the list of formats for a single video URL.

    Downloads the watch page and the get_video_info data, collects
    uploader/title/thumbnail/date/description/counters/subtitles, builds
    the format list (rtmp, url_encoded_fmt_stream_map/adaptive_fmts or
    HLS manifest) and optionally merges in the DASH manifest.

    Returns the info dict, or None when only the subtitle listing was
    requested ('listsubtitles').  Raises ExtractorError when the video
    info lacks a token, the uploader, or any usable format source.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the known 'el' variants until one yields a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Stays None when the player config cannot be found or parsed; the
    # DASH code below must cope with that.
    ytplayer_config = None
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        ytplayer_config = json.loads(mobj.group(1))
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    def _map_to_format_list(urlmap):
        # player_url is read when this is called, i.e. after the
        # signature handling below may have replaced it
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if (self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                # Plain indexing: a missing key raises KeyError, which is
                # handled below (.get(...)[0] would raise TypeError)
                dash_manifest_url = video_info['dashmpd'][0]
            elif ytplayer_config is None:
                # The player config was not found/parsed above
                raise ExtractorError(u'player configuration not found in webpage', expected=True)
            else:
                dash_manifest_url = ytplayer_config['args']['dashmpd']

            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Merge into an already known format with the same id,
                # otherwise add a new entry
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1453
880e1c52 1454class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1455 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1456 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1457 (?:https?://)?
1458 (?:\w+\.)?
1459 youtube\.com/
1460 (?:
1461 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1462 \? (?:.*?&)*? (?:p|a|list)=
1463 | p/
1464 )
d67cc9fa
JMF
1465 (
1466 (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1467 # Top tracks, they can also include dots
1468 |(?:MC)[\w\.]*
1469 )
c5e8d7af
PH
1470 .*
1471 |
715c8e7b 1472 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1473 )"""
dbb94fb0 1474 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dcbb4580 1475 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
dbb94fb0 1476 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1477 IE_NAME = u'youtube:playlist'
1478
880e1c52
JMF
1479 def _real_initialize(self):
1480 self._login()
1481
652cdaa2
JMF
1482 def _ids_to_results(self, ids):
1483 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1484 for vid_id in ids]
1485
1486 def _extract_mix(self, playlist_id):
1487 # The mixes are generated from a a single video
1488 # the id of the playlist is just 'RD' + video_id
7d4afc55 1489 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1490 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
bc2f773b
JMF
1491 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1492 title_span = (search_title('playlist-title') or
1493 search_title('title long-title') or search_title('title'))
76d1700b 1494 title = clean_html(title_span)
a2dafe28 1495 video_re = r'''(?x)data-video-username="(.*?)".*?
bc2f773b
JMF
1496 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
1497 matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
a2dafe28 1498 # Some of the videos may have been deleted, their username field is empty
bc2f773b 1499 ids = [video_id for (username, video_id) in matches if username]
652cdaa2
JMF
1500 url_results = self._ids_to_results(ids)
1501
1502 return self.playlist_result(url_results, playlist_id, title)
1503
c5e8d7af
PH
1504 def _real_extract(self, url):
1505 # Extract playlist id
d67cc9fa 1506 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1507 if mobj is None:
1508 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1509 playlist_id = mobj.group(1) or mobj.group(2)
1510
1511 # Check if it's a video-specific URL
7c61bd36 1512 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1513 if 'v' in query_dict:
1514 video_id = query_dict['v'][0]
1515 if self._downloader.params.get('noplaylist'):
1516 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1517 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1518 else:
1519 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1520
7d4afc55 1521 if playlist_id.startswith('RD'):
652cdaa2
JMF
1522 # Mixes require a custom extraction process
1523 return self._extract_mix(playlist_id)
0a688bc0
JMF
1524 if playlist_id.startswith('TL'):
1525 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1526 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1527
dbb94fb0
S
1528 url = self._TEMPLATE_URL % playlist_id
1529 page = self._download_webpage(url, playlist_id)
1530 more_widget_html = content_html = page
1531
dcbb4580
JMF
1532 # Extract the video ids from the playlist pages
1533 ids = []
c5e8d7af 1534
755eb032 1535 for page_num in itertools.count(1):
dbb94fb0 1536 matches = re.finditer(self._VIDEO_RE, content_html)
6e47b51e
JMF
1537 # We remove the duplicates and the link with index 0
1538 # (it's not the first video of the playlist)
1539 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1540 ids.extend(new_ids)
c5e8d7af 1541
dbb94fb0
S
1542 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1543 if not mobj:
c5e8d7af
PH
1544 break
1545
dbb94fb0
S
1546 more = self._download_json(
1547 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
1548 content_html = more['content_html']
1549 more_widget_html = more['load_more_widget_html']
1550
1551 playlist_title = self._html_search_regex(
1552 r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')
c5e8d7af 1553
652cdaa2 1554 url_results = self._ids_to_results(ids)
dcbb4580 1555 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1556
1557
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' pseudo URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list link on the channel page and return its videos as a playlist."""
        match = re.match(self._VALID_URL, url)
        channel, title = match.group('chann'), match.group('title')

        # The link to the list contains the url-encoded title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(list_url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1588
1589
class YoutubeChannelIE(InfoExtractor):
    """Extractor returning all the videos of a YouTube channel as a playlist."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order and without duplicates."""
        seen = set()
        ordered_ids = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid in seen:
                continue
            seen.add(vid)
            ordered_ids.append(vid)
        return ordered_ids

    def _real_extract(self, url):
        """Return a playlist result with every video found for the channel."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(vid, 'Youtube', video_id=vid)
                       for vid in video_ids]
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1644
1645
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user (URL or "ytuser:" keyword)."""

    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor class in this module accepts `url`."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex here is too permissive and would match as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError when `url` does not match `_VALID_URL` or
        when an API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type entry per video of the given 0-based page."""
            # The API's start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index covered by this page (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (
                username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(
                    u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: nothing to yield
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The id is the last path component of the entry's id URL
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1706
b05654f0
PH
1707
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the gdata videos feed ("ytsearch" keyword)."""

    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        limit = n
        page_index = 0

        # Each API page holds 50 results; keep going until the limit is covered
        while (50 * page_index) < limit:
            start_index = (50 * page_index) + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query), start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            page_ids = [video['id'] for video in api_response['items']]
            collected_ids.extend(page_ids)

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have pushed us past the requested amount
        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]

        videos = []
        for video_id in collected_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1746
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API query adds ``orderby=published``."""

    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint as the parent class, plus the orderby parameter
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1752
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; yields one playlist reference per season."""

    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one for each season playlist found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(
            url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(self.url_result(
                'https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1766
1767
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the remaining ``%s`` takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all linked videos as a playlist."""
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            raw_info = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_info)
            feed_html = info['feed_html']
            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # A null paging value marks the last page
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1810
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""

    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1816
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1822
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'watch_later' personal feed."""

    _FEED_NAME = 'watch_later'
    # Selects action_load_personal_feed in the inherited feed URL template
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1829
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' personal feed."""

    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains the regex escape ``\.``, which is an
    # invalid escape sequence in a non-raw literal.  The pattern text itself
    # is unchanged, and the raw form matches the sibling feed extractors.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Selects action_load_personal_feed in the inherited feed URL template
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1836
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that forwards to the playlist linked from the my_favorites page."""

    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1847
1848
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches URLs that look cut off at an ``&`` and reports a quoting hint."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .'
        )
        raise ExtractorError(hint, expected=True)