youtube_dl / extractor / youtube.py
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c91778f8 32 RegexNotFoundError,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
81c2f20b 37 uppercase_escape,
c5e8d7af
PH
38)
39
de7f3446 40class YoutubeBaseInfoExtractor(InfoExtractor):
b2e8bc1b
JMF
41 """Provide base functions for Youtube extractors"""
42 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
43 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
38c2e5b8 44 _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
b2e8bc1b
JMF
45 _NETRC_MACHINE = 'youtube'
46 # If True, it will raise an error if no login info is provided
47 _LOGIN_REQUIRED = False
48
b2e8bc1b 49 def _set_language(self):
7cc3570e
PH
50 return bool(self._download_webpage(
51 self._LANG_URL, None,
52 note=u'Setting language', errnote='unable to set language',
53 fatal=False))
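    # Requesting _LANG_URL (note the hl=en&persist_hl=1 parameters above) asks
    # YouTube to persist English as the interface language, so the English-only
    # regexes and messages relied on further down keep matching regardless of
    # the account's locale.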
b2e8bc1b
JMF
54
55 def _login(self):
56 (username, password) = self._get_login_info()
57 # No authentication to be performed
58 if username is None:
59 if self._LOGIN_REQUIRED:
60 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
61 return False
62
7cc3570e
PH
63 login_page = self._download_webpage(
64 self._LOGIN_URL, None,
65 note=u'Downloading login page',
66 errnote=u'unable to fetch login page', fatal=False)
67 if login_page is False:
68 return
b2e8bc1b 69
795f28f8
PH
70 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
71 login_page, u'Login GALX parameter')
c5e8d7af 72
b2e8bc1b
JMF
73 # Log in
74 login_form_strs = {
75 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
76 u'Email': username,
77 u'GALX': galx,
78 u'Passwd': password,
79 u'PersistentCookie': u'yes',
80 u'_utf8': u'霱',
81 u'bgresponse': u'js_disabled',
82 u'checkConnection': u'',
83 u'checkedDomains': u'youtube',
84 u'dnConn': u'',
b2e8bc1b
JMF
85 u'pstMsg': u'0',
86 u'rmShown': u'1',
87 u'secTok': u'',
88 u'signIn': u'Sign in',
89 u'timeStmp': u'',
90 u'service': u'youtube',
91 u'uilel': u'3',
92 u'hl': u'en_US',
93 }
94 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
95 # chokes on unicode
96 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
97 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
7cc3570e
PH
98
99 req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
100 login_results = self._download_webpage(
101 req, None,
102 note=u'Logging in', errnote=u'unable to log in', fatal=False)
103 if login_results is False:
104 return False
105 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
106 self._downloader.report_warning(u'unable to log in: bad username or password')
b2e8bc1b
JMF
107 return False
108 return True
109
110 def _confirm_age(self):
111 age_form = {
7cc3570e
PH
112 'next_url': '/',
113 'action_confirm': 'Confirm',
114 }
5700e779
JMF
115 req = compat_urllib_request.Request(self._AGE_URL,
116 compat_urllib_parse.urlencode(age_form).encode('ascii'))
7cc3570e
PH
117
118 self._download_webpage(
119 req, None,
120 note=u'Confirming age', errnote=u'Unable to confirm age')
b2e8bc1b
JMF
121 return True
122
123 def _real_initialize(self):
124 if self._downloader is None:
125 return
126 if not self._set_language():
127 return
128 if not self._login():
129 return
130 self._confirm_age()
c5e8d7af 131
8377574c 132
de7f3446 133class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 134 IE_DESC = u'YouTube.com'
cb7dfeea 135 _VALID_URL = r"""(?x)^
c5e8d7af 136 (
83aa5293 137 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 138 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 139 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 140 (?:www\.)?pwnyoutube\.com/|
e69ae5b9
JMF
141 tube\.majestyc\.net/|
142 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
143 (?:.*?\#/)? # handle anchor (#/) redirect urls
144 (?: # the various things that can precede the ID:
145 (?:(?:v|embed|e)/) # v/ or embed/ or e/
146 |(?: # or the v= param in all its forms
d741e55a 147 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
148 (?:\?|\#!?) # the params delimiter ? or # or #!
149 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
150 v=
151 )
f4b05232
JMF
152 ))
153 |youtu\.be/ # just youtu.be/xxxx
154 )
c5e8d7af 155 )? # all until now is optional -> you can pass the naked ID
8963d9c2 156 ([0-9A-Za-z_-]{11}) # here it is! the YouTube video ID
c5e8d7af
PH
157 (?(1).+)? # if we found the ID, everything can follow
158 $"""
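# A few illustrative inputs accepted by _VALID_URL (all resolve to the same
# 11-character video ID):
#   https://www.youtube.com/watch?v=BaW_jenozKc
#   https://youtu.be/BaW_jenozKc
#   //www.youtube.com/embed/BaW_jenozKc
#   BaW_jenozKc                # the naked ID is enough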
c5e8d7af 159 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
160 _formats = {
161 '5': {'ext': 'flv', 'width': 400, 'height': 240},
162 '6': {'ext': 'flv', 'width': 450, 'height': 270},
163 '13': {'ext': '3gp'},
164 '17': {'ext': '3gp', 'width': 176, 'height': 144},
165 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
166 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
167 '34': {'ext': 'flv', 'width': 640, 'height': 360},
168 '35': {'ext': 'flv', 'width': 854, 'height': 480},
169 '36': {'ext': '3gp', 'width': 320, 'height': 240},
170 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
171 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
172 '43': {'ext': 'webm', 'width': 640, 'height': 360},
173 '44': {'ext': 'webm', 'width': 854, 'height': 480},
174 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
175 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
176
1d043b93 177
86fe61c8 178 # 3d videos
2c62dc26
PH
179 '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
180 '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
181 '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
182 '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
183 '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
184 '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
185 '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
836a086c 186
96fb5605 187 # Apple HTTP Live Streaming
2c62dc26
PH
188 '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
189 '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
190 '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
191 '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
192 '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
193 '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
194 '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
195
196 # DASH mp4 video
197 '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
198 '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
199 '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
200 '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
201 '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
202 '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
203 '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
8fa8a629 204 '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
836a086c 205
f6f1fc92 206 # Dash mp4 audio
2c62dc26
PH
207 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
208 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
209 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
210
211 # Dash webm
1394ce65
PH
212 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
213 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
2c5bae42
PH
214 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
215 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
1394ce65
PH
216 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
217 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
218 '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
219 '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
220 '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
221 '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
222 '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
223 '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
224 '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
225
226 # Dash webm audio
227 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
228 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
229
230 # RTMP (unnamed)
231 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 232 }
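# The keys of _formats are YouTube itag identifiers (as strings). When an itag
# shows up in a stream map, its entry is merged into the format dict, e.g.
# (illustrative sketch, mirroring _map_to_format_list below):
#   >>> dct = {'format_id': '22', 'url': video_real_url}
#   >>> dct.update(YoutubeIE._formats['22'])
#   >>> dct['ext'], dct['width'], dct['height']
#   ('mp4', 1280, 720)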
836a086c 233
c5e8d7af 234 IE_NAME = u'youtube'
2eb88d95
PH
235 _TESTS = [
236 {
0e853ca4
PH
237 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
238 u"file": u"BaW_jenozKc.mp4",
239 u"info_dict": {
240 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
241 u"uploader": u"Philipp Hagemeister",
242 u"uploader_id": u"phihag",
243 u"upload_date": u"20121002",
27dcce19 244 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 245 }
0e853ca4 246 },
0e853ca4
PH
247 {
248 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
249 u"file": u"UxxajLWwzqY.mp4",
250 u"note": u"Test generic use_cipher_signature video (#897)",
251 u"info_dict": {
252 u"upload_date": u"20120506",
253 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 254 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 255 u"uploader": u"Icona Pop",
0e853ca4 256 u"uploader_id": u"IconaPop"
2eb88d95 257 }
c108eb73
JMF
258 },
259 {
260 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
261 u"file": u"07FYdnEawAQ.mp4",
262 u"note": u"Test VEVO video with age protection (#956)",
263 u"info_dict": {
264 u"upload_date": u"20130703",
265 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
266 u"description": u"md5:64249768eec3bc4276236606ea996373",
267 u"uploader": u"justintimberlakeVEVO",
268 u"uploader_id": u"justintimberlakeVEVO"
269 }
270 },
fccd3771 271 {
83aa5293 272 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
273 u"file": u"yZIXLfi8CZQ.mp4",
274 u"note": u"Embed-only video (#1746)",
275 u"info_dict": {
276 u"upload_date": u"20120608",
277 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
278 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
279 u"uploader": u"SET India",
280 u"uploader_id": u"setindia"
281 }
282 },
dd27fd17
PH
283 {
284 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
285 u"file": u"a9LDPn-MO4I.m4a",
286 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
287 u"info_dict": {
288 u"upload_date": "20121002",
289 u"uploader_id": "8KVIDEO",
290 u"description": "No description available.",
291 u"uploader": "8KVIDEO",
292 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
293 },
294 u"params": {
295 u"youtube_include_dash_manifest": True,
296 u"format": "141",
297 },
dd27fd17 298 },
2eb88d95
PH
299 ]
300
c5e8d7af
PH
301
302 @classmethod
303 def suitable(cls, url):
304 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 305 if YoutubePlaylistIE.suitable(url): return False
fccd3771 306 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 307
e0df6211
PH
308 def __init__(self, *args, **kwargs):
309 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 310 self._player_cache = {}
e0df6211 311
c5e8d7af
PH
312 def report_video_info_webpage_download(self, video_id):
313 """Report attempt to download video info webpage."""
314 self.to_screen(u'%s: Downloading video info webpage' % video_id)
315
c5e8d7af
PH
316 def report_information_extraction(self, video_id):
317 """Report attempt to extract video information."""
318 self.to_screen(u'%s: Extracting video information' % video_id)
319
320 def report_unavailable_format(self, video_id, format):
321 """Report extracted video URL."""
322 self.to_screen(u'%s: Format %s not available' % (video_id, format))
323
324 def report_rtmp_download(self):
325 """Indicate the download will use the RTMP protocol."""
326 self.to_screen(u'RTMP download detected')
327
c4417ddb
PH
328 def _extract_signature_function(self, video_id, player_url, slen):
329 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 330 player_url)
e0df6211
PH
331 player_type = id_m.group('ext')
332 player_id = id_m.group('id')
333
c4417ddb
PH
334 # Read from filesystem cache
335 func_id = '%s_%s_%d' % (player_type, player_id, slen)
336 assert os.path.basename(func_id) == func_id
c38b1e77 337 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 338
c3c88a26 339 cache_enabled = cache_dir is not None
f8061589 340 if cache_enabled:
c4417ddb
PH
341 cache_fn = os.path.join(os.path.expanduser(cache_dir),
342 u'youtube-sigfuncs',
343 func_id + '.json')
344 try:
edf3e38e 345 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
346 cache_spec = json.load(cachef)
347 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 348 except IOError:
c4417ddb 349 pass # No cache available
83799698 350
e0df6211
PH
351 if player_type == 'js':
352 code = self._download_webpage(
353 player_url, video_id,
83799698 354 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 355 errnote=u'Download of %s failed' % player_url)
83799698 356 res = self._parse_sig_js(code)
c4417ddb 357 elif player_type == 'swf':
e0df6211
PH
358 urlh = self._request_webpage(
359 player_url, video_id,
83799698 360 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
361 errnote=u'Download of %s failed' % player_url)
362 code = urlh.read()
83799698 363 res = self._parse_sig_swf(code)
e0df6211
PH
364 else:
365 assert False, 'Invalid player type %r' % player_type
366
f8061589 367 if cache_enabled:
edf3e38e 368 try:
c705320f
PH
369 test_string = u''.join(map(compat_chr, range(slen)))
370 cache_res = res(test_string)
edf3e38e
PH
371 cache_spec = [ord(c) for c in cache_res]
372 try:
373 os.makedirs(os.path.dirname(cache_fn))
374 except OSError as ose:
375 if ose.errno != errno.EEXIST:
376 raise
377 write_json_file(cache_spec, cache_fn)
0ca96d48 378 except Exception:
edf3e38e
PH
379 tb = traceback.format_exc()
380 self._downloader.report_warning(
381 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
382
383 return res
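            # The cache_spec written above is just a permutation of character
            # indices, so replaying it is a plain reindexing, e.g. (illustrative):
            #   >>> cache_spec = [2, 0, 1]      # hypothetical spec for slen == 3
            #   >>> u''.join('abc'[i] for i in cache_spec)
            #   u'cab'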
384
edf3e38e
PH
385 def _print_sig_code(self, func, slen):
386 def gen_sig_code(idxs):
387 def _genslice(start, end, step):
388 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
389 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
390 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
391 return u's[%s%s%s]' % (starts, ends, steps)
392
393 step = None
0ca96d48
PH
394 start = '(Never used)' # Quell pyflakes warnings - start will be
395 # set as soon as step is set
edf3e38e
PH
396 for i, prev in zip(idxs[1:], idxs[:-1]):
397 if step is not None:
398 if i - prev == step:
399 continue
400 yield _genslice(start, prev, step)
401 step = None
402 continue
403 if i - prev in [-1, 1]:
404 step = i - prev
405 start = prev
406 continue
407 else:
408 yield u's[%d]' % prev
409 if step is None:
410 yield u's[%d]' % i
411 else:
412 yield _genslice(start, i, step)
413
c705320f
PH
414 test_string = u''.join(map(compat_chr, range(slen)))
415 cache_res = func(test_string)
edf3e38e
PH
416 cache_spec = [ord(c) for c in cache_res]
417 expr_code = u' + '.join(gen_sig_code(cache_spec))
418 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 419 self.to_screen(u'Extracted signature function:\n' + code)
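    # gen_sig_code collapses runs of consecutive indices into slices, e.g.
    # (illustrative) a spec of [2, 3, 4, 5, 0] is rendered as
    #   s[2:6] + s[0]
    # which is the expression printed when the youtube_print_sig_code option
    # checked in _decrypt_signature below is set.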
edf3e38e 420
e0df6211
PH
421 def _parse_sig_js(self, jscode):
422 funcname = self._search_regex(
423 r'signature=([a-zA-Z]+)', jscode,
424 u'Initial JS player signature function name')
425
426 functions = {}
427
428 def argidx(varname):
429 return string.lowercase.index(varname)
430
431 def interpret_statement(stmt, local_vars, allow_recursion=20):
432 if allow_recursion < 0:
0ca96d48 433 raise ExtractorError(u'Recursion limit reached')
e0df6211
PH
434
435 if stmt.startswith(u'var '):
436 stmt = stmt[len(u'var '):]
437 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
438 r'=(?P<expr>.*)$', stmt)
439 if ass_m:
440 if ass_m.groupdict().get('index'):
441 def assign(val):
442 lvar = local_vars[ass_m.group('out')]
443 idx = interpret_expression(ass_m.group('index'),
444 local_vars, allow_recursion)
445 assert isinstance(idx, int)
446 lvar[idx] = val
447 return val
448 expr = ass_m.group('expr')
449 else:
450 def assign(val):
451 local_vars[ass_m.group('out')] = val
452 return val
453 expr = ass_m.group('expr')
454 elif stmt.startswith(u'return '):
455 assign = lambda v: v
456 expr = stmt[len(u'return '):]
457 else:
458 raise ExtractorError(
459 u'Cannot determine left side of statement in %r' % stmt)
460
461 v = interpret_expression(expr, local_vars, allow_recursion)
462 return assign(v)
463
464 def interpret_expression(expr, local_vars, allow_recursion):
465 if expr.isdigit():
466 return int(expr)
467
468 if expr.isalpha():
469 return local_vars[expr]
470
471 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
472 if m:
473 member = m.group('member')
474 val = local_vars[m.group('in')]
475 if member == 'split("")':
476 return list(val)
477 if member == 'join("")':
478 return u''.join(val)
479 if member == 'length':
480 return len(val)
481 if member == 'reverse()':
482 return val[::-1]
483 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
484 if slice_m:
485 idx = interpret_expression(
486 slice_m.group('idx'), local_vars, allow_recursion-1)
487 return val[idx:]
488
489 m = re.match(
490 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
491 if m:
492 val = local_vars[m.group('in')]
493 idx = interpret_expression(m.group('idx'), local_vars,
494 allow_recursion-1)
495 return val[idx]
496
497 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
498 if m:
499 a = interpret_expression(m.group('a'),
500 local_vars, allow_recursion)
501 b = interpret_expression(m.group('b'),
502 local_vars, allow_recursion)
503 return a % b
504
505 m = re.match(
20650c86 506 r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
e0df6211
PH
507 if m:
508 fname = m.group('func')
509 if fname not in functions:
510 functions[fname] = extract_function(fname)
511 argvals = [int(v) if v.isdigit() else local_vars[v]
512 for v in m.group('args').split(',')]
513 return functions[fname](argvals)
514 raise ExtractorError(u'Unsupported JS expression %r' % expr)
515
516 def extract_function(funcname):
517 func_m = re.search(
518 r'function ' + re.escape(funcname) +
519 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
520 jscode)
521 argnames = func_m.group('args').split(',')
522
523 def resf(args):
524 local_vars = dict(zip(argnames, args))
525 for stmt in func_m.group('code').split(';'):
526 res = interpret_statement(stmt, local_vars)
527 return res
528 return resf
529
530 initial_function = extract_function(funcname)
531 return lambda s: initial_function([s])
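    # The mini-interpreter above only understands the tiny JS subset that the
    # HTML5 player's signature scrambler actually uses: split(""), join(""),
    # reverse(), slice(n), length, index assignment and calls to sibling helper
    # functions; anything else raises "Unsupported JS expression".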
532
533 def _parse_sig_swf(self, file_contents):
534 if file_contents[1:3] != b'WS':
535 raise ExtractorError(
536 u'Not an SWF file; header is %r' % file_contents[:3])
537 if file_contents[:1] == b'C':
538 content = zlib.decompress(file_contents[8:])
539 else:
540 raise NotImplementedError(u'Unsupported compression format %r' %
541 file_contents[:1])
542
543 def extract_tags(content):
544 pos = 0
545 while pos < len(content):
546 header16 = struct.unpack('<H', content[pos:pos+2])[0]
547 pos += 2
548 tag_code = header16 >> 6
549 tag_len = header16 & 0x3f
550 if tag_len == 0x3f:
551 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
552 pos += 4
553 assert pos+tag_len <= len(content)
554 yield (tag_code, content[pos:pos+tag_len])
555 pos += tag_len
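        # SWF tag records pack the tag code and length into one little-endian
        # uint16 (upper 10 bits = code, lower 6 bits = length); a length field
        # of 0x3f signals a long record whose real length follows as a uint32.
        # Tag code 82 is DoABC, which carries the ActionScript bytecode.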
556
557 code_tag = next(tag
558 for tag_code, tag in extract_tags(content)
559 if tag_code == 82)
560 p = code_tag.index(b'\0', 4) + 1
ba552f54 561 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
562
563 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
564 def read_int(reader=None):
565 if reader is None:
566 reader = code_reader
e0df6211
PH
567 res = 0
568 shift = 0
569 for _ in range(5):
ba552f54
PH
570 buf = reader.read(1)
571 assert len(buf) == 1
572 b = struct.unpack('<B', buf)[0]
e0df6211
PH
573 res = res | ((b & 0x7f) << shift)
574 if b & 0x80 == 0:
575 break
576 shift += 7
ba552f54
PH
577 return res
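        # ABC integers are stored as variable-length quantities: 7 payload bits
        # per byte, with the high bit set on every byte except the last, e.g.
        # (illustrative) the byte sequence 0x96 0x01 decodes to
        # (0x96 & 0x7f) | (0x01 << 7) == 150.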
578
579 def u30(reader=None):
580 res = read_int(reader)
581 assert res & 0xf0000000 == 0
e0df6211
PH
582 return res
583 u32 = read_int
584
ba552f54
PH
585 def s32(reader=None):
586 v = read_int(reader)
e0df6211
PH
587 if v & 0x80000000 != 0:
588 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
589 return v
590
0ca96d48 591 def read_string(reader=None):
ba552f54
PH
592 if reader is None:
593 reader = code_reader
594 slen = u30(reader)
595 resb = reader.read(slen)
596 assert len(resb) == slen
597 return resb.decode('utf-8')
598
599 def read_bytes(count, reader=None):
600 if reader is None:
601 reader = code_reader
602 resb = reader.read(count)
603 assert len(resb) == count
604 return resb
605
606 def read_byte(reader=None):
607 resb = read_bytes(1, reader=reader)
608 res = struct.unpack('<B', resb)[0]
609 return res
e0df6211
PH
610
611 # minor_version + major_version
0ca96d48 612 read_bytes(2 + 2)
e0df6211
PH
613
614 # Constant pool
ba552f54 615 int_count = u30()
e0df6211 616 for _c in range(1, int_count):
0ca96d48 617 s32()
ba552f54 618 uint_count = u30()
e0df6211 619 for _c in range(1, uint_count):
0ca96d48 620 u32()
ba552f54 621 double_count = u30()
0ca96d48 622 read_bytes((double_count-1) * 8)
ba552f54 623 string_count = u30()
e0df6211
PH
624 constant_strings = [u'']
625 for _c in range(1, string_count):
0ca96d48 626 s = read_string()
e0df6211 627 constant_strings.append(s)
ba552f54 628 namespace_count = u30()
e0df6211 629 for _c in range(1, namespace_count):
0ca96d48
PH
630 read_bytes(1) # kind
631 u30() # name
ba552f54 632 ns_set_count = u30()
e0df6211 633 for _c in range(1, ns_set_count):
ba552f54 634 count = u30()
e0df6211 635 for _c2 in range(count):
0ca96d48 636 u30()
ba552f54 637 multiname_count = u30()
e0df6211
PH
638 MULTINAME_SIZES = {
639 0x07: 2, # QName
640 0x0d: 2, # QNameA
641 0x0f: 1, # RTQName
642 0x10: 1, # RTQNameA
643 0x11: 0, # RTQNameL
644 0x12: 0, # RTQNameLA
645 0x09: 2, # Multiname
646 0x0e: 2, # MultinameA
647 0x1b: 1, # MultinameL
648 0x1c: 1, # MultinameLA
649 }
650 multinames = [u'']
651 for _c in range(1, multiname_count):
ba552f54 652 kind = u30()
e0df6211
PH
653 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
654 if kind == 0x07:
0ca96d48 655 u30() # namespace_idx
ba552f54 656 name_idx = u30()
e0df6211
PH
657 multinames.append(constant_strings[name_idx])
658 else:
659 multinames.append('[MULTINAME kind: %d]' % kind)
660 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 661 u30()
e0df6211
PH
662
663 # Methods
ba552f54 664 method_count = u30()
e0df6211
PH
665 MethodInfo = collections.namedtuple(
666 'MethodInfo',
667 ['NEED_ARGUMENTS', 'NEED_REST'])
668 method_infos = []
669 for method_id in range(method_count):
ba552f54 670 param_count = u30()
0ca96d48 671 u30() # return type
e0df6211 672 for _ in range(param_count):
0ca96d48
PH
673 u30() # param type
674 u30() # name index (always 0 for youtube)
ba552f54 675 flags = read_byte()
e0df6211
PH
676 if flags & 0x08 != 0:
677 # Options present
ba552f54 678 option_count = u30()
e0df6211 679 for c in range(option_count):
0ca96d48
PH
680 u30() # val
681 read_bytes(1) # kind
e0df6211
PH
682 if flags & 0x80 != 0:
683 # Param names present
684 for _ in range(param_count):
0ca96d48 685 u30() # param name
e0df6211
PH
686 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
687 method_infos.append(mi)
688
689 # Metadata
ba552f54 690 metadata_count = u30()
e0df6211 691 for _c in range(metadata_count):
0ca96d48 692 u30() # name
ba552f54 693 item_count = u30()
e0df6211 694 for _c2 in range(item_count):
0ca96d48
PH
695 u30() # key
696 u30() # value
ba552f54
PH
697
698 def parse_traits_info():
699 trait_name_idx = u30()
700 kind_full = read_byte()
e0df6211
PH
701 kind = kind_full & 0x0f
702 attrs = kind_full >> 4
703 methods = {}
704 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
705 u30() # Slot id
706 u30() # type_name_idx
ba552f54 707 vindex = u30()
e0df6211 708 if vindex != 0:
0ca96d48 709 read_byte() # vkind
e0df6211 710 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 711 u30() # disp_id
ba552f54 712 method_idx = u30()
e0df6211
PH
713 methods[multinames[trait_name_idx]] = method_idx
714 elif kind == 0x04: # Class
0ca96d48
PH
715 u30() # slot_id
716 u30() # classi
e0df6211 717 elif kind == 0x05: # Function
0ca96d48 718 u30() # slot_id
ba552f54 719 function_idx = u30()
e0df6211
PH
720 methods[function_idx] = multinames[trait_name_idx]
721 else:
722 raise ExtractorError(u'Unsupported trait kind %d' % kind)
723
724 if attrs & 0x4 != 0: # Metadata present
ba552f54 725 metadata_count = u30()
e0df6211 726 for _c3 in range(metadata_count):
0ca96d48 727 u30() # metadata index
e0df6211 728
ba552f54 729 return methods
e0df6211
PH
730
731 # Classes
732 TARGET_CLASSNAME = u'SignatureDecipher'
733 searched_idx = multinames.index(TARGET_CLASSNAME)
734 searched_class_id = None
ba552f54 735 class_count = u30()
e0df6211 736 for class_id in range(class_count):
ba552f54 737 name_idx = u30()
e0df6211
PH
738 if name_idx == searched_idx:
739 # We found the class we're looking for!
740 searched_class_id = class_id
0ca96d48 741 u30() # super_name idx
ba552f54 742 flags = read_byte()
e0df6211 743 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 744 u30() # protected_ns_idx
ba552f54 745 intrf_count = u30()
e0df6211 746 for _c2 in range(intrf_count):
0ca96d48
PH
747 u30()
748 u30() # iinit
ba552f54 749 trait_count = u30()
e0df6211 750 for _c2 in range(trait_count):
0ca96d48 751 parse_traits_info()
e0df6211
PH
752
753 if searched_class_id is None:
754 raise ExtractorError(u'Target class %r not found' %
755 TARGET_CLASSNAME)
756
757 method_names = {}
758 method_idxs = {}
759 for class_id in range(class_count):
0ca96d48 760 u30() # cinit
ba552f54 761 trait_count = u30()
e0df6211 762 for _c2 in range(trait_count):
ba552f54 763 trait_methods = parse_traits_info()
e0df6211
PH
764 if class_id == searched_class_id:
765 method_names.update(trait_methods.items())
766 method_idxs.update(dict(
767 (idx, name)
768 for name, idx in trait_methods.items()))
769
770 # Scripts
ba552f54 771 script_count = u30()
e0df6211 772 for _c in range(script_count):
0ca96d48 773 u30() # init
ba552f54 774 trait_count = u30()
e0df6211 775 for _c2 in range(trait_count):
0ca96d48 776 parse_traits_info()
e0df6211
PH
777
778 # Method bodies
ba552f54 779 method_body_count = u30()
e0df6211
PH
780 Method = collections.namedtuple('Method', ['code', 'local_count'])
781 methods = {}
782 for _c in range(method_body_count):
ba552f54 783 method_idx = u30()
0ca96d48 784 u30() # max_stack
ba552f54 785 local_count = u30()
0ca96d48
PH
786 u30() # init_scope_depth
787 u30() # max_scope_depth
ba552f54
PH
788 code_length = u30()
789 code = read_bytes(code_length)
e0df6211 790 if method_idx in method_idxs:
ba552f54 791 m = Method(code, local_count)
e0df6211 792 methods[method_idxs[method_idx]] = m
ba552f54 793 exception_count = u30()
e0df6211 794 for _c2 in range(exception_count):
0ca96d48
PH
795 u30() # from
796 u30() # to
797 u30() # target
798 u30() # exc_type
799 u30() # var_name
ba552f54 800 trait_count = u30()
e0df6211 801 for _c2 in range(trait_count):
0ca96d48 802 parse_traits_info()
e0df6211 803
ba552f54 804 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
805 assert len(methods) == len(method_idxs)
806
807 method_pyfunctions = {}
808
809 def extract_function(func_name):
810 if func_name in method_pyfunctions:
811 return method_pyfunctions[func_name]
812 if func_name not in methods:
813 raise ExtractorError(u'Cannot find function %r' % func_name)
814 m = methods[func_name]
815
816 def resfunc(args):
e0df6211
PH
817 registers = ['(this)'] + list(args) + [None] * m.local_count
818 stack = []
819 coder = io.BytesIO(m.code)
820 while True:
821 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 822 if opcode == 36: # pushbyte
e0df6211
PH
823 v = struct.unpack('!B', coder.read(1))[0]
824 stack.append(v)
825 elif opcode == 44: # pushstring
826 idx = u30(coder)
827 stack.append(constant_strings[idx])
828 elif opcode == 48: # pushscope
829 # We don't implement the scope register, so we'll just
830 # ignore the popped value
831 stack.pop()
832 elif opcode == 70: # callproperty
833 index = u30(coder)
834 mname = multinames[index]
835 arg_count = u30(coder)
836 args = list(reversed(
837 [stack.pop() for _ in range(arg_count)]))
838 obj = stack.pop()
839 if mname == u'split':
840 assert len(args) == 1
841 assert isinstance(args[0], compat_str)
842 assert isinstance(obj, compat_str)
843 if args[0] == u'':
844 res = list(obj)
845 else:
846 res = obj.split(args[0])
847 stack.append(res)
a7177865
PH
848 elif mname == u'slice':
849 assert len(args) == 1
850 assert isinstance(args[0], int)
851 assert isinstance(obj, list)
852 res = obj[args[0]:]
853 stack.append(res)
854 elif mname == u'join':
855 assert len(args) == 1
856 assert isinstance(args[0], compat_str)
857 assert isinstance(obj, list)
858 res = args[0].join(obj)
859 stack.append(res)
e0df6211
PH
860 elif mname in method_pyfunctions:
861 stack.append(method_pyfunctions[mname](args))
862 else:
863 raise NotImplementedError(
864 u'Unsupported property %r on %r'
865 % (mname, obj))
a7177865
PH
866 elif opcode == 72: # returnvalue
867 res = stack.pop()
868 return res
869 elif opcode == 79: # callpropvoid
870 index = u30(coder)
871 mname = multinames[index]
872 arg_count = u30(coder)
873 args = list(reversed(
874 [stack.pop() for _ in range(arg_count)]))
875 obj = stack.pop()
876 if mname == u'reverse':
877 assert isinstance(obj, list)
878 obj.reverse()
879 else:
880 raise NotImplementedError(
881 u'Unsupported (void) property %r on %r'
882 % (mname, obj))
e0df6211
PH
883 elif opcode == 93: # findpropstrict
884 index = u30(coder)
885 mname = multinames[index]
886 res = extract_function(mname)
887 stack.append(res)
888 elif opcode == 97: # setproperty
889 index = u30(coder)
890 value = stack.pop()
891 idx = stack.pop()
892 obj = stack.pop()
893 assert isinstance(obj, list)
894 assert isinstance(idx, int)
895 obj[idx] = value
896 elif opcode == 98: # getlocal
897 index = u30(coder)
898 stack.append(registers[index])
899 elif opcode == 99: # setlocal
900 index = u30(coder)
901 value = stack.pop()
902 registers[index] = value
903 elif opcode == 102: # getproperty
904 index = u30(coder)
905 pname = multinames[index]
906 if pname == u'length':
907 obj = stack.pop()
908 assert isinstance(obj, list)
909 stack.append(len(obj))
910 else: # Assume attribute access
911 idx = stack.pop()
912 assert isinstance(idx, int)
913 obj = stack.pop()
914 assert isinstance(obj, list)
915 stack.append(obj[idx])
916 elif opcode == 128: # coerce
0ca96d48 917 u30(coder)
e0df6211
PH
918 elif opcode == 133: # coerce_s
919 assert isinstance(stack[-1], (type(None), compat_str))
920 elif opcode == 164: # modulo
921 value2 = stack.pop()
922 value1 = stack.pop()
923 res = value1 % value2
924 stack.append(res)
a7177865
PH
925 elif opcode == 208: # getlocal_0
926 stack.append(registers[0])
927 elif opcode == 209: # getlocal_1
928 stack.append(registers[1])
929 elif opcode == 210: # getlocal_2
930 stack.append(registers[2])
931 elif opcode == 211: # getlocal_3
932 stack.append(registers[3])
e0df6211
PH
933 elif opcode == 214: # setlocal_2
934 registers[2] = stack.pop()
935 elif opcode == 215: # setlocal_3
936 registers[3] = stack.pop()
937 else:
938 raise NotImplementedError(
939 u'Unsupported opcode %d' % opcode)
940
941 method_pyfunctions[func_name] = resfunc
942 return resfunc
943
944 initial_function = extract_function(u'decipher')
945 return lambda s: initial_function([s])
946
83799698 947 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 948 """Turn the encrypted s field into a working signature"""
6b37f0be 949
83799698 950 if player_url is not None:
9f9be844
PH
951 if player_url.startswith(u'//'):
952 player_url = u'https:' + player_url
e0df6211 953 try:
7f8ae73a
PH
954 player_id = (player_url, len(s))
955 if player_id not in self._player_cache:
83799698 956 func = self._extract_signature_function(
c4417ddb 957 video_id, player_url, len(s)
e0df6211 958 )
7f8ae73a
PH
959 self._player_cache[player_id] = func
960 func = self._player_cache[player_id]
edf3e38e
PH
961 if self._downloader.params.get('youtube_print_sig_code'):
962 self._print_sig_code(func, len(s))
963 return func(s)
0ca96d48 964 except Exception:
e0df6211 965 tb = traceback.format_exc()
83799698
PH
966 self._downloader.report_warning(
967 u'Automatic signature extraction failed: ' + tb)
e0df6211 968
d2d8f895
PH
969 self._downloader.report_warning(
970 u'Warning: Falling back to static signature algorithm')
920de7a2 971
2f2ffea9
PH
972 return self._static_decrypt_signature(
973 s, video_id, player_url, age_gate)
e0df6211 974
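# _static_decrypt_signature is the hard-coded fallback used when automatic
# extraction fails: each branch below is a fixed reshuffle keyed only on the
# length of the scrambled signature, e.g. (illustrative) for an 83-character
# signature the descrambled value is built as
#     s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
# i.e. two reversed slices with two single characters spliced in between.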
2f2ffea9 975 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
976 if age_gate:
977 # The videos with age protection use another player, so the
978 # algorithms can be different.
979 if len(s) == 86:
980 return s[2:63] + s[82] + s[64:82] + s[63]
981
bc4b9008 982 if len(s) == 93:
983 return s[86:29:-1] + s[88] + s[28:5:-1]
984 elif len(s) == 92:
444b1165 985 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
986 elif len(s) == 91:
987 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
988 elif len(s) == 90:
989 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 990 elif len(s) == 89:
991 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 992 elif len(s) == 88:
3e223834 993 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 994 elif len(s) == 87:
3a725669 995 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 996 elif len(s) == 86:
f2c327fd 997 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 998 elif len(s) == 85:
6ae8ee3f 999 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1000 elif len(s) == 84:
6f56389b 1001 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1002 elif len(s) == 83:
920de7a2 1003 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1004 elif len(s) == 82:
c21315f2 1005 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1006 elif len(s) == 81:
aedd6bb9 1007 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1008 elif len(s) == 80:
1009 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1010 elif len(s) == 79:
1011 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1012
1013 else:
1014 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1015
1f343eaa 1016 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1017 try:
7fad1c63 1018 sub_list = self._download_webpage(
38c2e5b8 1019 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
1020 video_id, note=False)
1021 except ExtractorError as err:
de7f3446
JMF
1022 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1023 return {}
1024 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1025
1026 sub_lang_list = {}
1027 for l in lang_list:
1028 lang = l[1]
1029 params = compat_urllib_parse.urlencode({
1030 'lang': lang,
1031 'v': video_id,
ca715127 1032 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 1033 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 1034 })
38c2e5b8 1035 url = u'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
1036 sub_lang_list[lang] = url
1037 if not sub_lang_list:
1038 self._downloader.report_warning(u'video doesn\'t have subtitles')
1039 return {}
1040 return sub_lang_list
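    # The timedtext list endpoint answers with a small XML document whose
    # <track> elements carry name="..." and lang_code="..." attributes; that is
    # what the re.findall() above picks out before building one
    # /api/timedtext URL per language.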
1041
055e6f36 1042 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1043 """We need the webpage for getting the captions url, pass it as an
1044 argument to speed up the process."""
ca715127 1045 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1046 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1047 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1048 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1049 if mobj is None:
1050 self._downloader.report_warning(err_msg)
1051 return {}
1052 player_config = json.loads(mobj.group(1))
1053 try:
1054 args = player_config[u'args']
1055 caption_url = args[u'ttsurl']
1056 timestamp = args[u'timestamp']
055e6f36
JMF
1057 # We get the available subtitles
1058 list_params = compat_urllib_parse.urlencode({
1059 'type': 'list',
1060 'tlangs': 1,
1061 'asrs': 1,
de7f3446 1062 })
055e6f36 1063 list_url = caption_url + '&' + list_params
e26f8712 1064 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1065 original_lang_node = caption_list.find('track')
f6a54188 1066 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
e3dc22ca
JMF
1067 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1068 return {}
1069 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1070
1071 sub_lang_list = {}
1072 for lang_node in caption_list.findall('target'):
1073 sub_lang = lang_node.attrib['lang_code']
1074 params = compat_urllib_parse.urlencode({
1075 'lang': original_lang,
1076 'tlang': sub_lang,
1077 'fmt': sub_format,
1078 'ts': timestamp,
1079 'kind': 'asr',
1080 })
1081 sub_lang_list[sub_lang] = caption_url + '&' + params
1082 return sub_lang_list
de7f3446
JMF
1083 # An extractor error can be raised by the download process if there are
1084 # no automatic captions but there are subtitles
1085 except (KeyError, ExtractorError):
1086 self._downloader.report_warning(err_msg)
1087 return {}
1088
97665381
PH
1089 @classmethod
1090 def extract_id(cls, url):
1091 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
c5e8d7af
PH
1092 if mobj is None:
1093 raise ExtractorError(u'Invalid URL: %s' % url)
1094 video_id = mobj.group(2)
1095 return video_id
1096
1d043b93
JMF
1097 def _extract_from_m3u8(self, manifest_url, video_id):
1098 url_map = {}
1099 def _get_urls(_manifest):
1100 lines = _manifest.split('\n')
1101 urls = filter(lambda l: l and not l.startswith('#'),
1102 lines)
1103 return urls
1104 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1105 formats_urls = _get_urls(manifest)
1106 for format_url in formats_urls:
890f62e8 1107 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1108 url_map[itag] = format_url
1109 return url_map
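    # The HLS master manifest is plain text: comment/tag lines start with '#'
    # and every other line is a variant-stream URL that embeds the itag in its
    # path, e.g. (illustrative, hypothetical host)
    #   https://manifest.googlevideo.com/.../itag/95/.../index.m3u8
    # which the regex above maps to itag '95'.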
1110
1fb07d10
JG
1111 def _extract_annotations(self, video_id):
1112 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1113 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1114
c5e8d7af
PH
1115 def _real_extract(self, url):
1116 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1117 mobj = re.search(self._NEXT_URL_RE, url)
1118 if mobj:
1119 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 1120 video_id = self.extract_id(url)
c5e8d7af
PH
1121
1122 # Get video webpage
c5e8d7af 1123 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 1124 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
1125
1126 # Attempt to extract SWF player URL
e0df6211 1127 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
1128 if mobj is not None:
1129 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1130 else:
1131 player_url = None
1132
1133 # Get video info
1134 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
1135 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1136 self.report_age_confirmation()
1137 age_gate = True
1138 # We simulate access to the video from www.youtube.com/v/{video_id}
1139 # this page can be viewed without logging in to YouTube
1140 data = compat_urllib_parse.urlencode({'video_id': video_id,
fccd3771 1141 'el': 'player_embedded',
c108eb73
JMF
1142 'gl': 'US',
1143 'hl': 'en',
1144 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1145 'asv': 3,
1146 'sts':'1588',
1147 })
1148 video_info_url = 'https://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
1149 video_info_webpage = self._download_webpage(video_info_url, video_id,
1150 note=False,
1151 errnote='unable to download video info webpage')
1152 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
1153 else:
1154 age_gate = False
1155 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1156 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1157 % (video_id, el_type))
1158 video_info_webpage = self._download_webpage(video_info_url, video_id,
1159 note=False,
1160 errnote='unable to download video info webpage')
1161 video_info = compat_parse_qs(video_info_webpage)
1162 if 'token' in video_info:
1163 break
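            # get_video_info answers with application/x-www-form-urlencoded
            # data, so compat_parse_qs turns it into a dict of lists, roughly
            # (illustrative sketch):
            #   {'status': ['ok'], 'token': ['...'],
            #    'url_encoded_fmt_stream_map': ['itag=22&url=...,itag=34&url=...']}
            # 'token' therefore doubles as the success marker checked below.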
c5e8d7af
PH
1164 if 'token' not in video_info:
1165 if 'reason' in video_info:
9a82b238 1166 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
c5e8d7af
PH
1167 else:
1168 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1169
1d699755
PH
1170 if 'view_count' in video_info:
1171 view_count = int(video_info['view_count'][0])
1172 else:
1173 view_count = None
1174
c5e8d7af
PH
1175 # Check for "rental" videos
1176 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1177 raise ExtractorError(u'"rental" videos not supported')
1178
1179 # Start extracting information
1180 self.report_information_extraction(video_id)
1181
1182 # uploader
1183 if 'author' not in video_info:
1184 raise ExtractorError(u'Unable to extract uploader name')
1185 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1186
1187 # uploader_id
1188 video_uploader_id = None
1189 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1190 if mobj is not None:
1191 video_uploader_id = mobj.group(1)
1192 else:
1193 self._downloader.report_warning(u'unable to extract uploader nickname')
1194
1195 # title
a8c6b241
PH
1196 if 'title' in video_info:
1197 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1198 else:
1199 self._downloader.report_warning(u'Unable to extract video title')
1200 video_title = u'_'
c5e8d7af
PH
1201
1202 # thumbnail image
7763b04e
JMF
1203 # We try first to get a high quality image:
1204 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1205 video_webpage, re.DOTALL)
1206 if m_thumb is not None:
1207 video_thumbnail = m_thumb.group(1)
1208 elif 'thumbnail_url' not in video_info:
c5e8d7af 1209 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 1210 video_thumbnail = None
c5e8d7af
PH
1211 else: # don't panic if we can't find it
1212 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1213
1214 # upload date
1215 upload_date = None
1216 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1217 if mobj is not None:
1218 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1219 upload_date = unified_strdate(upload_date)
1220
1221 # description
1222 video_description = get_element_by_id("eow-description", video_webpage)
1223 if video_description:
27dcce19
PH
1224 video_description = re.sub(r'''(?x)
1225 <a\s+
1226 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1227 title="([^"]+)"\s+
1228 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1229 class="yt-uix-redirect-link"\s*>
1230 [^<]+
1231 </a>
1232 ''', r'\1', video_description)
c5e8d7af
PH
1233 video_description = clean_html(video_description)
1234 else:
1235 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1236 if fd_mobj:
1237 video_description = unescapeHTML(fd_mobj.group(1))
1238 else:
1239 video_description = u''
1240
336c3a69 1241 def _extract_count(klass):
46374a56
PH
1242 count = self._search_regex(
1243 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1244 video_webpage, klass, default=None)
336c3a69
JMF
1245 if count is not None:
1246 return int(count.replace(',', ''))
1247 return None
1248 like_count = _extract_count(u'likes-count')
1249 dislike_count = _extract_count(u'dislikes-count')
1250
c5e8d7af 1251 # subtitles
d82134c3 1252 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 1253
c5e8d7af 1254 if self._downloader.params.get('listsubtitles', False):
d665f8d3 1255 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
1256 return
1257
1258 if 'length_seconds' not in video_info:
1259 self._downloader.report_warning(u'unable to extract video duration')
b466b702 1260 video_duration = None
c5e8d7af 1261 else:
b466b702 1262 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 1263
1fb07d10
JG
1264 # annotations
1265 video_annotations = None
1266 if self._downloader.params.get('writeannotations', False):
1267 video_annotations = self._extract_annotations(video_id)
1268
c5e8d7af 1269 # Decide which formats to download
c5e8d7af
PH
1270 try:
1271 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
50be92c1
PH
1272 if not mobj:
1273 raise ValueError('Could not find vevo ID')
c5e8d7af
PH
1274 info = json.loads(mobj.group(1))
1275 args = info['args']
7ce7e394
JMF
1276 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1277 # these signatures are encrypted
44d46655 1278 if 'url_encoded_fmt_stream_map' not in args:
f10503db 1279 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
1280 re_signature = re.compile(r'[&,]s=')
1281 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
1282 if m_s is not None:
1283 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 1284 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 1285 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 1286 if m_s is not None:
00fe14fc
JMF
1287 if 'adaptive_fmts' in video_info:
1288 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 1289 else:
00fe14fc 1290 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
1291 except ValueError:
1292 pass
1293
dd27fd17
PH
1294 def _map_to_format_list(urlmap):
1295 formats = []
1296 for itag, video_real_url in urlmap.items():
1297 dct = {
1298 'format_id': itag,
1299 'url': video_real_url,
1300 'player_url': player_url,
1301 }
0b65e5d4
PH
1302 if itag in self._formats:
1303 dct.update(self._formats[itag])
dd27fd17
PH
1304 formats.append(dct)
1305 return formats
1306
c5e8d7af
PH
1307 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1308 self.report_rtmp_download()
dd27fd17
PH
1309 formats = [{
1310 'format_id': '_rtmp',
1311 'protocol': 'rtmp',
1312 'url': video_info['conn'][0],
1313 'player_url': player_url,
1314 }]
00fe14fc
JMF
1315 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1316 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1317 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 1318 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 1319 url_map = {}
00fe14fc 1320 for url_data_str in encoded_url_map.split(','):
c5e8d7af
PH
1321 url_data = compat_parse_qs(url_data_str)
1322 if 'itag' in url_data and 'url' in url_data:
1323 url = url_data['url'][0]
1324 if 'sig' in url_data:
1325 url += '&signature=' + url_data['sig'][0]
1326 elif 's' in url_data:
e0df6211 1327 encrypted_sig = url_data['s'][0]
769fda3c 1328 if self._downloader.params.get('verbose'):
c108eb73 1329 if age_gate:
bdde940e
PH
1330 if player_url is None:
1331 player_version = 'unknown'
1332 else:
1333 player_version = self._search_regex(
1334 r'-(.+)\.swf$', player_url,
1335 u'flash player', fatal=False)
e0df6211 1336 player_desc = 'flash player %s' % player_version
c108eb73 1337 else:
83799698
PH
1338 player_version = self._search_regex(
1339 r'html5player-(.+?)\.js', video_webpage,
c108eb73 1340 'html5 player', fatal=False)
e0df6211
PH
1341 player_desc = u'html5 player %s' % player_version
1342
1343 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
5a76c651 1344 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
e0df6211
PH
1345 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1346
83799698 1347 if not age_gate:
e0df6211
PH
1348 jsplayer_url_json = self._search_regex(
1349 r'"assets":.+?"js":\s*("[^"]+")',
1350 video_webpage, u'JS player URL')
83799698 1351 player_url = json.loads(jsplayer_url_json)
e0df6211 1352
83799698
PH
1353 signature = self._decrypt_signature(
1354 encrypted_sig, video_id, player_url, age_gate)
c5e8d7af
PH
1355 url += '&signature=' + signature
1356 if 'ratebypass' not in url:
1357 url += '&ratebypass=yes'
1358 url_map[url_data['itag'][0]] = url
dd27fd17 1359 formats = _map_to_format_list(url_map)
1d043b93
JMF
1360 elif video_info.get('hlsvp'):
1361 manifest_url = video_info['hlsvp'][0]
1362 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 1363 formats = _map_to_format_list(url_map)
c5e8d7af 1364 else:
9abb3204 1365 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 1366
dd27fd17
PH
1367 # Look for the DASH manifest
1368 dash_manifest_url_lst = video_info.get('dashmpd')
4919603f
PH
1369 if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
1370 self._downloader.params.get('youtube_include_dash_manifest', False)):
dd27fd17
PH
1371 try:
1372 dash_doc = self._download_xml(
1373 dash_manifest_url_lst[0], video_id,
1374 note=u'Downloading DASH manifest',
1375 errnote=u'Could not download DASH manifest')
1376 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
1377 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
1378 if url_el is None:
1379 continue
1380 format_id = r.attrib['id']
1381 video_url = url_el.text
1382 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
1383 f = {
1384 'format_id': format_id,
1385 'url': video_url,
1386 'width': int_or_none(r.attrib.get('width')),
1387 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
1388 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
1389 'filesize': filesize,
1390 }
1391 try:
1392 existing_format = next(
1393 fo for fo in formats
1394 if fo['format_id'] == format_id)
1395 except StopIteration:
1396 f.update(self._formats.get(format_id, {}))
1397 formats.append(f)
1398 else:
1399 existing_format.update(f)
1400
1401 except (ExtractorError, KeyError) as e:
1402 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
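        # Each <Representation> in the DASH MPD carries an id attribute that
        # matches a YouTube itag, plus bandwidth/width/audioSamplingRate
        # attributes and a <BaseURL> child holding the media URL, which is why
        # the loop above can either enrich an existing format or append a new
        # one keyed on the same format_id.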
d80044c2 1403
4bcc7bd1 1404 self._sort_formats(formats)
4ea3be0a 1405
1406 return {
1407 'id': video_id,
1408 'uploader': video_uploader,
1409 'uploader_id': video_uploader_id,
1410 'upload_date': upload_date,
1411 'title': video_title,
1412 'thumbnail': video_thumbnail,
1413 'description': video_description,
1414 'subtitles': video_subtitles,
1415 'duration': video_duration,
1416 'age_limit': 18 if age_gate else 0,
1417 'annotations': video_annotations,
1418 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1419 'view_count': view_count,
1420 'like_count': like_count,
1421 'dislike_count': dislike_count,
1422 'formats': formats,
1423 }
c5e8d7af 1424
880e1c52 1425class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1426 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1427 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1428 (?:https?://)?
1429 (?:\w+\.)?
1430 youtube\.com/
1431 (?:
1432 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1433 \? (?:.*?&)*? (?:p|a|list)=
1434 | p/
1435 )
d67cc9fa
JMF
1436 (
1437 (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1438 # Top tracks, they can also include dots
1439 |(?:MC)[\w\.]*
1440 )
c5e8d7af
PH
1441 .*
1442 |
715c8e7b 1443 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1444 )"""
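# Illustrative playlist IDs accepted above: ordinary playlists ('PL...'),
# uploads ('UU...'), favourites ('FL...') and mixes ('RD' + a video ID); mixes
# get their own extraction path in _extract_mix() below.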
dcbb4580
JMF
1445 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1446 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1447 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1448 IE_NAME = u'youtube:playlist'
1449
880e1c52
JMF
1450 def _real_initialize(self):
1451 self._login()
1452
652cdaa2
JMF
1453 def _ids_to_results(self, ids):
1454 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1455 for vid_id in ids]
1456
1457 def _extract_mix(self, playlist_id):
1458 # The mixes are generated from a single video
1459 # the id of the playlist is just 'RD' + video_id
7d4afc55 1460 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1461 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1462 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1463 get_element_by_attribute('class', 'title ', webpage))
1464 title = clean_html(title_span)
652cdaa2
JMF
1465 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1466 ids = orderedSet(re.findall(video_re, webpage))
1467 url_results = self._ids_to_results(ids)
1468
1469 return self.playlist_result(url_results, playlist_id, title)
1470
c5e8d7af
PH
1471 def _real_extract(self, url):
1472 # Extract playlist id
d67cc9fa 1473 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1474 if mobj is None:
1475 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1476 playlist_id = mobj.group(1) or mobj.group(2)
1477
1478 # Check if it's a video-specific URL
7c61bd36 1479 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1480 if 'v' in query_dict:
1481 video_id = query_dict['v'][0]
1482 if self._downloader.params.get('noplaylist'):
1483 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1484 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1485 else:
1486 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1487
7d4afc55 1488 if playlist_id.startswith('RD'):
652cdaa2
JMF
1489 # Mixes require a custom extraction process
1490 return self._extract_mix(playlist_id)
0a688bc0
JMF
1491 if playlist_id.startswith('TL'):
1492 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1493 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1494
dcbb4580
JMF
1495 # Extract the video ids from the playlist pages
1496 ids = []
c5e8d7af 1497
755eb032 1498 for page_num in itertools.count(1):
dcbb4580 1499 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1500 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1501 matches = re.finditer(self._VIDEO_RE, page)
1502 # We remove the duplicates and the link with index 0
1503 # (it's not the first video of the playlist)
1504 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1505 ids.extend(new_ids)
c5e8d7af 1506
dcbb4580 1507 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1508 break
1509
c91778f8
PH
1510 try:
1511 playlist_title = self._og_search_title(page)
1512 except RegexNotFoundError:
1513 self.report_warning(
1514 u'Playlist page is missing OpenGraph title, falling back ...',
1515 playlist_id)
1516 playlist_title = self._html_search_regex(
1517 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1518
652cdaa2 1519 url_results = self._ids_to_results(ids)
dcbb4580 1520 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1521
1522
0a688bc0
JMF
1523 class YoutubeTopListIE(YoutubePlaylistIE):
1524 IE_NAME = u'youtube:toplist'
1525 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1526 u' (Example: "yttoplist:music:Top Tracks")')
1527 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1528
1529 def _real_extract(self, url):
1530 mobj = re.match(self._VALID_URL, url)
1531 channel = mobj.group('chann')
1532 title = mobj.group('title')
1533 query = compat_urllib_parse.urlencode({'title': title})
beddbc2a 1534 playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
0a688bc0
JMF
1535 channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
1536 link = self._html_search_regex(playlist_re, channel_page, u'list')
1537 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
1538
1539 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1540 ids = []
1541 # Sometimes the webpage doesn't contain the videos;
1542 # retry until we get them
1543 for i in itertools.count(0):
1544 msg = u'Downloading Youtube top list'
1545 if i > 0:
1546 msg += ', retry #%d' % i
1547 webpage = self._download_webpage(url, title, msg)
1548 ids = orderedSet(re.findall(video_re, webpage))
1549 if ids:
1550 break
1551 url_results = self._ids_to_results(ids)
1552 return self.playlist_result(url_results, playlist_title=title)
1553
1554
c5e8d7af 1555 class YoutubeChannelIE(InfoExtractor):
0f818663 1556 IE_DESC = u'YouTube.com channels'
c5e8d7af 1557 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1558 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
38c2e5b8 1559 _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1560 IE_NAME = u'youtube:channel'
1561
1562 def extract_videos_from_page(self, page):
1563 ids_in_page = []
1564 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1565 if mobj.group(1) not in ids_in_page:
1566 ids_in_page.append(mobj.group(1))
1567 return ids_in_page
1568
1569 def _real_extract(self, url):
1570 # Extract channel id
1571 mobj = re.match(self._VALID_URL, url)
1572 if mobj is None:
1573 raise ExtractorError(u'Invalid URL: %s' % url)
1574
1575 # Download channel page
1576 channel_id = mobj.group(1)
1577 video_ids = []
b9643eed
JMF
1578 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1579 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1580 autogenerated = re.search(r'''(?x)
1581 class="[^"]*?(?:
1582 channel-header-autogenerated-label|
1583 yt-channel-title-autogenerated
1584 )[^"]*"''', channel_page) is not None
c5e8d7af 1585
b9643eed
JMF
1586 if autogenerated:
1587 # The videos are contained in a single page
1588 # the ajax pages can't be used, they are empty
1589 video_ids = self.extract_videos_from_page(channel_page)
1590 else:
1591 # Download all channel pages using the json-based channel_ajax query
1592 for pagenum in itertools.count(1):
1593 url = self._MORE_PAGES_URL % (pagenum, channel_id)
81c2f20b
PH
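# uppercase_escape rewrites \Uxxxxxxxx escape sequences, which are not
# valid JSON, so that the ajax response can be parsed.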
1594 page = self._download_json(
1595 url, channel_id, note=u'Downloading page #%s' % pagenum,
1596 transform_source=uppercase_escape)
1597
b9643eed
JMF
1598 ids_in_page = self.extract_videos_from_page(page['content_html'])
1599 video_ids.extend(ids_in_page)
1600
1601 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1602 break
c5e8d7af
PH
1603
1604 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1605
7012b23c
PH
1606 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1607 for video_id in video_ids]
1608 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1609
1610
1611 class YoutubeUserIE(InfoExtractor):
0f818663 1612 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
57da92b7 1613 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
38c2e5b8 1614 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
c5e8d7af 1615 _GDATA_PAGE_SIZE = 50
38c2e5b8 1616 _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
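# With a hypothetical username and the first page, _GDATA_URL expands to:
#   https://gdata.youtube.com/feeds/api/users/someuser/uploads?max-results=50&start-index=1&alt=json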
c5e8d7af
PH
1617 IE_NAME = u'youtube:user'
1618
e3ea4790 1619 @classmethod
f4b05232 1620 def suitable(cls, url):
e3ea4790
JMF
1621 # Don't return True if the url can be extracted with other youtube
1622 # extractor, the regex would is too permissive and it would match.
1623 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1624 if any(ie.suitable(url) for ie in other_ies): return False
f4b05232
JMF
1625 else: return super(YoutubeUserIE, cls).suitable(url)
1626
c5e8d7af
PH
1627 def _real_extract(self, url):
1628 # Extract username
1629 mobj = re.match(self._VALID_URL, url)
1630 if mobj is None:
1631 raise ExtractorError(u'Invalid URL: %s' % url)
1632
1633 username = mobj.group(1)
1634
1635 # Download video ids using the YouTube Data API. Result size per
1636 # query is limited (currently to 50 videos), so we have to query
1637 # page by page until no more video ids are returned, which means
1638 # we have all of them.
1639
b7ab0590 1640 def download_page(pagenum):
c5e8d7af
PH
1641 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1642
1643 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
b7ab0590
PH
1644 page = self._download_webpage(
1645 gdata_url, username,
1646 u'Downloading video ids from %d to %d' % (
1647 start_index, start_index + self._GDATA_PAGE_SIZE))
c5e8d7af 1648
fd9cf738
JMF
1649 try:
1650 response = json.loads(page)
1651 except ValueError as err:
1652 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
71c82637 1653 if 'entry' not in response['feed']:
b7ab0590 1654 return
fd9cf738 1655
c5e8d7af 1656 # Extract video identifiers
e302f9ce
PH
1657 entries = response['feed']['entry']
1658 for entry in entries:
1659 title = entry['title']['$t']
1660 video_id = entry['id']['$t'].split('/')[-1]
b7ab0590 1661 yield {
e302f9ce
PH
1662 '_type': 'url',
1663 'url': video_id,
1664 'ie_key': 'Youtube',
b11cec41 1665 'id': video_id,
e302f9ce 1666 'title': title,
b7ab0590
PH
1667 }
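# PagedList wraps the download_page generator so that only the pages
# covering the requested entries are actually downloaded.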
1668 url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
c5e8d7af 1669
7012b23c
PH
1670 return self.playlist_result(url_results, playlist_title=username)
1671
b05654f0
PH
1672
1673 class YoutubeSearchIE(SearchInfoExtractor):
0f818663 1674 IE_DESC = u'YouTube.com searches'
b05654f0
PH
1675 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1676 _MAX_RESULTS = 1000
1677 IE_NAME = u'youtube:search'
1678 _SEARCH_KEY = 'ytsearch'
1679
b05654f0
PH
1680 def _get_n_results(self, query, n):
1681 """Get a specified number of results for a query"""
1682
1683 video_ids = []
1684 pagenum = 0
1685 limit = n
1686
1687 while (50 * pagenum) < limit:
b05654f0 1688 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
7cc3570e
PH
1689 data_json = self._download_webpage(
1690 result_url, video_id=u'query "%s"' % query,
1691 note=u'Downloading page %s' % (pagenum + 1),
1692 errnote=u'Unable to download API page')
1693 data = json.loads(data_json)
1694 api_response = data['data']
1695
1696 if 'items' not in api_response:
07ad22b8
PH
1697 raise ExtractorError(
1698 u'[youtube] No video results', expected=True)
b05654f0
PH
1699
1700 new_ids = list(video['id'] for video in api_response['items'])
1701 video_ids += new_ids
1702
1703 limit = min(n, api_response['totalItems'])
1704 pagenum += 1
1705
1706 if len(video_ids) > n:
1707 video_ids = video_ids[:n]
7012b23c
PH
1708 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1709 for video_id in video_ids]
b05654f0 1710 return self.playlist_result(videos, query)
75dff0ee 1711
a3dd9248 1712 class YoutubeSearchDateIE(YoutubeSearchIE):
cb7fb546 1713 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
a3dd9248
CM
1714 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1715 _SEARCH_KEY = 'ytsearchdate'
08fb86c4 1716 IE_DESC = u'YouTube.com searches, newest videos first'
75dff0ee
JMF
1717
1718 class YoutubeShowIE(InfoExtractor):
0f818663 1719 IE_DESC = u'YouTube.com (multi-season) shows'
75dff0ee
JMF
1720 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1721 IE_NAME = u'youtube:show'
1722
1723 def _real_extract(self, url):
1724 mobj = re.match(self._VALID_URL, url)
1725 show_name = mobj.group(1)
1726 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1727 # There's one playlist for each season of the show
1728 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1729 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
1730 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
04cc9617
JMF
1731
1732
b2e8bc1b 1733 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
d7ae0639
JMF
1734 """
1735 Base class for extractors that fetch info from
1736 http://www.youtube.com/feed_ajax
1737 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1738 """
b2e8bc1b 1739 _LOGIN_REQUIRED = True
43ba5456
JMF
1740 # use action_load_personal_feed instead of action_load_system_feed
1741 _PERSONAL_FEED = False
04cc9617 1742
d7ae0639
JMF
1743 @property
1744 def _FEED_TEMPLATE(self):
43ba5456
JMF
1745 action = 'action_load_system_feed'
1746 if self._PERSONAL_FEED:
1747 action = 'action_load_personal_feed'
38c2e5b8 1748 return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
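# For example, for the recommended feed (a system feed) this yields
#   https://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=recommended&paging=%s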
d7ae0639
JMF
1749
1750 @property
1751 def IE_NAME(self):
1752 return u'youtube:%s' % self._FEED_NAME
04cc9617 1753
81f0259b 1754 def _real_initialize(self):
b2e8bc1b 1755 self._login()
81f0259b 1756
04cc9617
JMF
1757 def _real_extract(self, url):
1758 feed_entries = []
0e44d838
JMF
1759 paging = 0
1760 for i in itertools.count(1):
d7ae0639
JMF
1761 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1762 u'%s feed' % self._FEED_NAME,
04cc9617
JMF
1763 u'Downloading page %s' % i)
1764 info = json.loads(info)
1765 feed_html = info['feed_html']
43ba5456 1766 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
04cc9617 1767 ids = orderedSet(m.group(1) for m in m_ids)
7012b23c
PH
1768 feed_entries.extend(
1769 self.url_result(video_id, 'Youtube', video_id=video_id)
1770 for video_id in ids)
04cc9617
JMF
1771 if info['paging'] is None:
1772 break
0e44d838 1773 paging = info['paging']
d7ae0639
JMF
1774 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1775
1776 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1777 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1778 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1779 _FEED_NAME = 'subscriptions'
1780 _PLAYLIST_TITLE = u'Youtube Subscriptions'
1781
1782 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1783 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1784 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1785 _FEED_NAME = 'recommended'
1786 _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1787
43ba5456
JMF
1788 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1789 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1790 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1791 _FEED_NAME = 'watch_later'
1792 _PLAYLIST_TITLE = u'Youtube Watch Later'
43ba5456 1793 _PERSONAL_FEED = True
c626a3d9 1794
f459d170
JMF
1795 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1796 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1797 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
1798 _FEED_NAME = 'history'
1799 _PERSONAL_FEED = True
1800 _PLAYLIST_TITLE = u'Youtube Watch History'
1801
c626a3d9
JMF
1802 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1803 IE_NAME = u'youtube:favorites'
1804 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
c7a7750d 1805 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
c626a3d9
JMF
1806 _LOGIN_REQUIRED = True
1807
1808 def _real_extract(self, url):
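# The favourites page embeds an ordinary playlist; extract its id and
# delegate the actual extraction to YoutubePlaylistIE.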
1809 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1810 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1811 return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1812
1813
1814 class YoutubeTruncatedURLIE(InfoExtractor):
1815 IE_NAME = 'youtube:truncated_url'
1816 IE_DESC = False # Do not list
975d35db
PH
1817 _VALID_URL = r'''(?x)
1818 (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
1819 (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
1820 '''
15870e90
PH
1821
1822 def _real_extract(self, url):
1823 raise ExtractorError(
1824 u'Did you forget to quote the URL? Remember that & is a meta '
1825 u'character in most shells, so you want to put the URL in quotes, '
1826 u'like youtube-dl '
b4622a32
PH
1827 u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
1828 u' or simply youtube-dl BaW_jenozKc .',
15870e90 1829 expected=True)