# coding: utf-8

import collections
import errno
import io
import itertools
import json
import os.path
import re
import string
import struct
import traceback
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_cachedir,
    get_element_by_id,
    get_element_by_attribute,
    ExtractorError,
    int_or_none,
    PagedList,
    RegexNotFoundError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                  # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                      # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                        # handle anchor (#/) redirect urls
                         (?:                                                # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                             # v/ or embed/ or e/
                             |(?:                                           # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                # the params delimiter ? or # or #!
                                 (?:.*?&)?                                  # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                        # just youtu.be/xxxx
                         )
                     )?                                                     # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                    # here is it! the YouTube video ID
                     (?(1).+)?                                              # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},

        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url) is not None

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

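    # Signature handling: some videos carry an obfuscated 's' parameter that
    # must be turned into a valid signature. The helpers below download the
    # HTML5 (JS) or Flash (SWF) player, derive that transformation from its
    # code, and cache the result on disk keyed by player ID and signature
    # length.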
    def _extract_signature_function(self, video_id, player_url, slen):
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res

    def _print_sig_code(self, func, slen):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)

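    # The JS "parser" below is not a general interpreter: it only understands
    # the handful of statement and expression forms that YouTube's signature
    # routine actually uses (splits, joins, slices, swaps, modulo and calls to
    # helper functions defined in the same player file).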
    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        functions = {}

        def argidx(varname):
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion-1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                return res
            return resf

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])

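    # For the Flash player the signature routine lives in compiled
    # ActionScript: the method below inflates the SWF, pulls out the DoABC
    # tag (tag code 82), walks the ABC constant pool and class table to find
    # the SignatureDecipher class, and then interprets just enough AVM2
    # opcodes to run its decipher method on the scrambled signature.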
    def _parse_sig_swf(self, file_contents):
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])

    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
            try:
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    )
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                return func(s)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        if age_gate:
            # The videos with age protection use another player, so the
            # algorithms can be different.
            if len(s) == 86:
                return s[2:63] + s[82] + s[64:82] + s[63]

        if len(s) == 93:
            return s[86:29:-1] + s[88] + s[28:5:-1]
        elif len(s) == 92:
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
        elif len(s) == 91:
            return s[84:27:-1] + s[86] + s[26:5:-1]
        elif len(s) == 90:
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
        elif len(s) == 89:
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
        elif len(s) == 88:
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
        elif len(s) == 87:
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
        elif len(s) == 86:
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
        elif len(s) == 85:
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
        elif len(s) == 84:
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
        elif len(s) == 83:
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
        elif len(s) == 82:
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
        elif len(s) == 81:
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
        elif len(s) == 80:
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
        elif len(s) == 79:
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        else:
            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))

    def _get_available_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
            return {}
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(l[0]).encode('utf-8'),
            })
            url = u'https://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
            return {}
        return sub_lang_list

    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}

        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map

    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')

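    # Main extraction flow: fetch the watch page, query the get_video_info
    # endpoint (using an embedded-player style request for age-gated videos),
    # decode url_encoded_fmt_stream_map / adaptive_fmts into formats
    # (deciphering signatures where needed), and optionally merge in the
    # formats listed in the DASH manifest.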
    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without logging in to Youtube
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'el': 'player_embedded',
                'gl': 'US',
                'hl': 'en',
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'asv': 3,
                'sts': '1588',
            })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # (these signatures are encrypted)
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        dash_manifest_url_lst = video_info.get('dashmpd')
        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
                self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                dash_doc = self._download_xml(
                    dash_manifest_url_lst[0], video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
c5e8d7af 1422
880e1c52 1423class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1424 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1425 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1426 (?:https?://)?
1427 (?:\w+\.)?
1428 youtube\.com/
1429 (?:
1430 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1431 \? (?:.*?&)*? (?:p|a|list)=
1432 | p/
1433 )
d67cc9fa
JMF
1434 (
1435 (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1436 # Top tracks, they can also include dots
1437 |(?:MC)[\w\.]*
1438 )
c5e8d7af
PH
1439 .*
1440 |
715c8e7b 1441 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1442 )"""
dcbb4580
JMF
1443 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1444 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1445 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1446 IE_NAME = u'youtube:playlist'
1447
880e1c52
JMF
1448 def _real_initialize(self):
1449 self._login()
1450
652cdaa2
JMF
1451 def _ids_to_results(self, ids):
1452 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1453 for vid_id in ids]
1454
1455 def _extract_mix(self, playlist_id):
1456 # The mixes are generated from a a single video
1457 # the id of the playlist is just 'RD' + video_id
7d4afc55 1458 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1459 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1460 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1461 get_element_by_attribute('class', 'title ', webpage))
1462 title = clean_html(title_span)
652cdaa2
JMF
1463 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1464 ids = orderedSet(re.findall(video_re, webpage))
1465 url_results = self._ids_to_results(ids)
1466
1467 return self.playlist_result(url_results, playlist_id, title)
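
# --------------------------------------------------------------------------
# Standalone sketch (not part of the extractor) of the relationship described
# in the comment above: a mix playlist id is 'RD' plus the seed video id, so
# the watch URL can be rebuilt from its last 11 characters. The id below is a
# made-up placeholder, not a real playlist.
playlist_id = 'RD0123456789A'            # 'RD' + hypothetical 11-char video id
seed_video_id = playlist_id[-11:]        # -> '0123456789A'
url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
print(url)
# --------------------------------------------------------------------------
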
1468
c5e8d7af
PH
1469 def _real_extract(self, url):
1470 # Extract playlist id
d67cc9fa 1471 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1472 if mobj is None:
1473 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1474 playlist_id = mobj.group(1) or mobj.group(2)
1475
1476 # Check if it's a video-specific URL
7c61bd36 1477 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1478 if 'v' in query_dict:
1479 video_id = query_dict['v'][0]
1480 if self._downloader.params.get('noplaylist'):
1481 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1482 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1483 else:
1484 self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1485
7d4afc55 1486 if playlist_id.startswith('RD'):
652cdaa2
JMF
1487 # Mixes require a custom extraction process
1488 return self._extract_mix(playlist_id)
0a688bc0
JMF
1489 if playlist_id.startswith('TL'):
1490 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1491 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1492
dcbb4580
JMF
1493 # Extract the video ids from the playlist pages
1494 ids = []
c5e8d7af 1495
755eb032 1496 for page_num in itertools.count(1):
dcbb4580 1497 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1498 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1499 matches = re.finditer(self._VIDEO_RE, page)
1500 # We remove the duplicates and the link with index 0
1501 # (it's not the first video of the playlist)
1502 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1503 ids.extend(new_ids)
c5e8d7af 1504
dcbb4580 1505 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1506 break
1507
c91778f8
PH
1508 try:
1509 playlist_title = self._og_search_title(page)
1510 except RegexNotFoundError:
1511 self.report_warning(
1512 u'Playlist page is missing OpenGraph title, falling back ...',
1513 playlist_id)
1514 playlist_title = self._html_search_regex(
1515 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1516
652cdaa2 1517 url_results = self._ids_to_results(ids)
dcbb4580 1518 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1519
1520
0a688bc0
JMF
1521class YoutubeTopListIE(YoutubePlaylistIE):
1522 IE_NAME = u'youtube:toplist'
1523 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1524 u' (Example: "yttoplist:music:Top Tracks")')
1525 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1526
1527 def _real_extract(self, url):
1528 mobj = re.match(self._VALID_URL, url)
1529 channel = mobj.group('chann')
1530 title = mobj.group('title')
1531 query = compat_urllib_parse.urlencode({'title': title})
beddbc2a 1532 playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
0a688bc0
JMF
1533 channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
1534 link = self._html_search_regex(playlist_re, channel_page, u'list')
1535 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
1536
1537 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1538 ids = []
1539 # sometimes the webpage doesn't contain the videos
1540 # retry until we get them
1541 for i in itertools.count(0):
1542 msg = u'Downloading Youtube mix'
1543 if i > 0:
1544 msg += ', retry #%d' % i
1545 webpage = self._download_webpage(url, title, msg)
1546 ids = orderedSet(re.findall(video_re, webpage))
1547 if ids:
1548 break
1549 url_results = self._ids_to_results(ids)
1550 return self.playlist_result(url_results, playlist_title=title)
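
# --------------------------------------------------------------------------
# Standalone sketch (not part of the extractor) of the retry-until-populated
# pattern above: keep re-downloading the page until at least one video id
# turns up, annotating the progress message with the retry count. _fetch()
# and its canned responses are made-up stand-ins for _download_webpage().
import itertools
import re

_VIDEO_RE = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
_CANNED_PAGES = iter(['', '', '<li data-index="1" data-video-id="AAAAAAAAAAA">'])

def _fetch():
    return next(_CANNED_PAGES)

ids = []
for i in itertools.count(0):
    msg = 'Downloading Youtube mix' + (', retry #%d' % i if i > 0 else '')
    page = _fetch()
    ids = re.findall(_VIDEO_RE, page)
    if ids:
        break
print(msg)  # Downloading Youtube mix, retry #2
print(ids)  # ['AAAAAAAAAAA']
# --------------------------------------------------------------------------
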
1551
1552
c5e8d7af 1553class YoutubeChannelIE(InfoExtractor):
0f818663 1554 IE_DESC = u'YouTube.com channels'
c5e8d7af 1555 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1556 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
38c2e5b8 1557 _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1558 IE_NAME = u'youtube:channel'
1559
1560 def extract_videos_from_page(self, page):
1561 ids_in_page = []
1562 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1563 if mobj.group(1) not in ids_in_page:
1564 ids_in_page.append(mobj.group(1))
1565 return ids_in_page
1566
1567 def _real_extract(self, url):
1568 # Extract channel id
1569 mobj = re.match(self._VALID_URL, url)
1570 if mobj is None:
1571 raise ExtractorError(u'Invalid URL: %s' % url)
1572
1573 # Download channel page
1574 channel_id = mobj.group(1)
1575 video_ids = []
b9643eed
JMF
1576 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1577 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1578 autogenerated = re.search(r'''(?x)
1579 class="[^"]*?(?:
1580 channel-header-autogenerated-label|
1581 yt-channel-title-autogenerated
1582 )[^"]*"''', channel_page) is not None
c5e8d7af 1583
b9643eed
JMF
1584 if autogenerated:
1585 # The videos are contained in a single page
1586 # the ajax pages can't be used, they are empty
1587 video_ids = self.extract_videos_from_page(channel_page)
1588 else:
1589 # Download all channel pages using the json-based channel_ajax query
1590 for pagenum in itertools.count(1):
1591 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1592 page = self._download_webpage(url, channel_id,
1593 u'Downloading page #%s' % pagenum)
1594
1595 page = json.loads(page)
1596
1597 ids_in_page = self.extract_videos_from_page(page['content_html'])
1598 video_ids.extend(ids_in_page)
1599
1600 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1601 break
c5e8d7af
PH
1602
1603 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1604
7012b23c
PH
1605 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1606 for video_id in video_ids]
1607 return self.playlist_result(url_entries, channel_id)
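
# --------------------------------------------------------------------------
# Standalone sketch (not part of the extractor) of the paging loop used above
# for non-autogenerated channels: each JSON page carries rendered HTML in
# 'content_html' plus a 'load_more_widget_html' blob, and paging stops once
# the load-more marker disappears. The canned pages below are made-up
# stand-ins for real c4_browse_ajax responses.
import itertools
import re

_MORE_PAGES_INDICATOR = 'yt-uix-load-more'
_CANNED_PAGES = {
    1: {'content_html': '<a href="/watch?v=AAAAAAAAAAA&feature=x">a</a>',
        'load_more_widget_html': '<button class="yt-uix-load-more">More</button>'},
    2: {'content_html': '<a href="/watch?v=BBBBBBBBBBB&feature=x">b</a>',
        'load_more_widget_html': ''},
}

video_ids = []
for pagenum in itertools.count(1):
    page = _CANNED_PAGES[pagenum]
    for vid in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page['content_html']):
        if vid not in video_ids:
            video_ids.append(vid)
    if _MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
        break
print(video_ids)  # ['AAAAAAAAAAA', 'BBBBBBBBBBB']
# --------------------------------------------------------------------------
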
c5e8d7af
PH
1608
1609
1610class YoutubeUserIE(InfoExtractor):
0f818663 1611 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1612 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
38c2e5b8 1613 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
c5e8d7af 1614 _GDATA_PAGE_SIZE = 50
38c2e5b8 1615 _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
c5e8d7af
PH
1616 IE_NAME = u'youtube:user'
1617
e3ea4790 1618 @classmethod
f4b05232 1619 def suitable(cls, url):
e3ea4790
JMF
1620 # Don't return True if the url can be extracted with another youtube
1621 # extractor; this regex is too permissive and would match it.
1622 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1623 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1624 else: return super(YoutubeUserIE, cls).suitable(url)
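
# --------------------------------------------------------------------------
# Standalone sketch (not part of the extractor) of the "defer to more
# specific extractors" pattern used by suitable() above, with made-up
# stand-in classes and URLs instead of the real youtube extractors.
import re

class _SpecificIE(object):
    """Made-up stand-in for a more specific extractor."""
    @classmethod
    def suitable(cls, url):
        return re.match(r'https?://example\.invalid/video/\d+$', url) is not None

class _CatchAllIE(object):
    """Made-up stand-in for the permissive user extractor."""
    _VALID_URL = r'https?://example\.invalid/.+'

    @classmethod
    def suitable(cls, url):
        others = [k for (n, k) in globals().items()
                  if n.endswith('IE') and k is not cls and hasattr(k, 'suitable')]
        if any(ie.suitable(url) for ie in others):
            return False
        return re.match(cls._VALID_URL, url) is not None

print(_CatchAllIE.suitable('https://example.invalid/video/42'))       # False
print(_CatchAllIE.suitable('https://example.invalid/user/somebody'))  # True
# --------------------------------------------------------------------------
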
1625
c5e8d7af
PH
1626 def _real_extract(self, url):
1627 # Extract username
1628 mobj = re.match(self._VALID_URL, url)
1629 if mobj is None:
1630 raise ExtractorError(u'Invalid URL: %s' % url)
1631
1632 username = mobj.group(1)
1633
1634 # Download video ids using YouTube Data API. Result size per
1635 # query is limited (currently to 50 videos) so we need to query
1636 # page by page until no video ids are returned, which means we
1637 # have got all of them.
1638
b7ab0590 1639 def download_page(pagenum):
c5e8d7af
PH
1640 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1641
1642 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
b7ab0590
PH
1643 page = self._download_webpage(
1644 gdata_url, username,
1645 u'Downloading video ids from %d to %d' % (
1646 start_index, start_index + self._GDATA_PAGE_SIZE))
c5e8d7af 1647
fd9cf738
JMF
1648 try:
1649 response = json.loads(page)
1650 except ValueError as err:
1651 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637 1652 if 'entry' not in response['feed']:
b7ab0590 1653 return
fd9cf738 1654
c5e8d7af 1655 # Extract video identifiers
e302f9ce
PH
1656 entries = response['feed']['entry']
1657 for entry in entries:
1658 title = entry['title']['$t']
1659 video_id = entry['id']['$t'].split('/')[-1]
b7ab0590 1660 yield {
e302f9ce
PH
1661 '_type': 'url',
1662 'url': video_id,
1663 'ie_key': 'Youtube',
b11cec41 1664 'id': video_id,
e302f9ce 1665 'title': title,
b7ab0590
PH
1666 }
1667 url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
c5e8d7af 1668
7012b23c
PH
1669 return self.playlist_result(url_results, playlist_title=username)
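
# --------------------------------------------------------------------------
# Standalone sketch (not part of the extractor): the GData feed is 1-indexed
# and capped at 50 results per request, so page N of the uploads (N counted
# from 0, as PagedList does) starts at N * 50 + 1, matching the start_index
# computed in download_page() above.
_GDATA_PAGE_SIZE = 50
for pagenum in range(3):
    start_index = pagenum * _GDATA_PAGE_SIZE + 1
    print('page %d: results %d-%d' % (
        pagenum + 1, start_index, start_index + _GDATA_PAGE_SIZE - 1))
# page 1: results 1-50
# page 2: results 51-100
# page 3: results 101-150
# --------------------------------------------------------------------------
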
1670
b05654f0
PH
1671
1672class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1673 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1674 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1675 _MAX_RESULTS = 1000
1676 IE_NAME = u'youtube:search'
1677 _SEARCH_KEY = 'ytsearch'
1678
b05654f0
PH
1679 def _get_n_results(self, query, n):
1680 """Get a specified number of results for a query"""
1681
1682 video_ids = []
1683 pagenum = 0
1684 limit = n
1685
1686 while (50 * pagenum) < limit:
b05654f0 1687 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
7cc3570e
PH
1688 data_json = self._download_webpage(
1689 result_url, video_id=u'query "%s"' % query,
1690 note=u'Downloading page %s' % (pagenum + 1),
1691 errnote=u'Unable to download API page')
1692 data = json.loads(data_json)
1693 api_response = data['data']
1694
1695 if 'items' not in api_response:
b05654f0
PH
1696 raise ExtractorError(u'[youtube] No video results')
1697
1698 new_ids = list(video['id'] for video in api_response['items'])
1699 video_ids += new_ids
1700
1701 limit = min(n, api_response['totalItems'])
1702 pagenum += 1
1703
1704 if len(video_ids) > n:
1705 video_ids = video_ids[:n]
7012b23c
PH
1706 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1707 for video_id in video_ids]
b05654f0 1708 return self.playlist_result(videos, query)
75dff0ee 1709
a3dd9248 1710class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 1711 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248
CM
1712 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1713 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1714 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1715
1716class YoutubeShowIE(InfoExtractor):
0f818663 1717 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1718 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1719 IE_NAME = u'youtube:show'
1720
1721 def _real_extract(self, url):
1722 mobj = re.match(self._VALID_URL, url)
1723 show_name = mobj.group(1)
1724 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1725 # There's one playlist for each season of the show
1726 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1727 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1728 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1729
1730
b2e8bc1b 1731class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1732 """
1733 Base class for extractors that fetch info from
1734 http://www.youtube.com/feed_ajax
1735 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1736 """
b2e8bc1b 1737 _LOGIN_REQUIRED = True
43ba5456
JMF
1738 # use action_load_personal_feed instead of action_load_system_feed
1739 _PERSONAL_FEED = False
04cc9617 1740
d7ae0639
JMF
1741 @property
1742 def _FEED_TEMPLATE(self):
43ba5456
JMF
1743 action = 'action_load_system_feed'
1744 if self._PERSONAL_FEED:
1745 action = 'action_load_personal_feed'
38c2e5b8 1746 return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
d7ae0639
JMF
1747
1748 @property
1749 def IE_NAME(self):
1750 return u'youtube:%s' % self._FEED_NAME
04cc9617 1751
81f0259b 1752 def _real_initialize(self):
b2e8bc1b 1753 self._login()
81f0259b 1754
04cc9617
JMF
1755 def _real_extract(self, url):
1756 feed_entries = []
0e44d838
JMF
1757 paging = 0
1758 for i in itertools.count(1):
d7ae0639
JMF
1759 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1760 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1761 u'Downloading page %s' % i)
1762 info = json.loads(info)
1763 feed_html = info['feed_html']
43ba5456 1764 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617 1765 ids = orderedSet(m.group(1) for m in m_ids)
7012b23c
PH
1766 feed_entries.extend(
1767 self.url_result(video_id, 'Youtube', video_id=video_id)
1768 for video_id in ids)
04cc9617
JMF
1769 if info['paging'] is None:
1770 break
0e44d838 1771 paging = info['paging']
d7ae0639
JMF
1772 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
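
# --------------------------------------------------------------------------
# Standalone sketch (not part of the extractor): the feed_ajax endpoint
# answers with rendered HTML plus a 'paging' token, and the loop above keeps
# requesting pages with the previous token until the server returns
# paging = None. The canned responses below (keyed by token) are made-up
# stand-ins for real feed_ajax JSON.
import re

_FAKE_FEED = {
    0: {'feed_html': '"/watch?v=AAAAAAAAAAA"', 'paging': 20},
    20: {'feed_html': '"/watch?v=BBBBBBBBBBB&feature=x"', 'paging': None},
}

entries = []
paging = 0
while True:
    info = _FAKE_FEED[paging]
    entries.extend(re.findall(r'"/watch\?v=(.*?)["&]', info['feed_html']))
    if info['paging'] is None:
        break
    paging = info['paging']
print(entries)  # ['AAAAAAAAAAA', 'BBBBBBBBBBB']
# --------------------------------------------------------------------------
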
1773
1774class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1775 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1776 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1777 _FEED_NAME = 'subscriptions'
1778 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1779
1780class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1781 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1782 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1783 _FEED_NAME = 'recommended'
1784 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1785
43ba5456
JMF
1786class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1787 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1788 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1789 _FEED_NAME = 'watch_later'
1790 _PLAYLIST_TITLE = u'Youtube Watch Later'
43ba5456 1791 _PERSONAL_FEED = True
c626a3d9 1792
f459d170
JMF
1793class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1794 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1795 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1796 _FEED_NAME = 'history'
1797 _PERSONAL_FEED = True
1798 _PLAYLIST_TITLE = u'Youtube Watch History'
1799
c626a3d9
JMF
1800class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1801 IE_NAME = u'youtube:favorites'
1802 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1803 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1804 _LOGIN_REQUIRED = True
1805
1806 def _real_extract(self, url):
1807 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1808 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1809 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1810
1811
1812class YoutubeTruncatedURLIE(InfoExtractor):
1813 IE_NAME = 'youtube:truncated_url'
1814 IE_DESC = False # Do not list
975d35db
PH
1815 _VALID_URL = r'''(?x)
1816 (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
1817 (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
1818 '''
15870e90
PH
1819
1820 def _real_extract(self, url):
1821 raise ExtractorError(
1822 u'Did you forget to quote the URL? Remember that & is a meta '
1823 u'character in most shells, so you want to put the URL in quotes, '
1824 u'like youtube-dl '
b4622a32
PH
1825 u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
1826 u' or simply youtube-dl BaW_jenozKc .',
15870e90 1827 expected=True)