]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Move JavaScript interpreter into its own module
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
2b25cb5d 17from ..jsinterp import JSInterpreter
c5e8d7af 18from ..utils import (
edf3e38e 19 compat_chr,
c5e8d7af 20 compat_parse_qs,
c5e8d7af
PH
21 compat_urllib_parse,
22 compat_urllib_request,
7c61bd36 23 compat_urlparse,
c5e8d7af
PH
24 compat_str,
25
26 clean_html,
c38b1e77 27 get_cachedir,
c5e8d7af 28 get_element_by_id,
652cdaa2 29 get_element_by_attribute,
c5e8d7af 30 ExtractorError,
dd27fd17 31 int_or_none,
b7ab0590 32 PagedList,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
81c2f20b 37 uppercase_escape,
c5e8d7af
PH
38)
39
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL; return True if the page could be downloaded."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Try to log in with the configured credentials.

        Returns True when the login request went through and the response
        does not contain the login form again, False in every other case.
        Raises ExtractorError if no credentials are available although
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Every other exit of this method returns a bool, so report the
            # failure as False rather than as an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were refused
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 131
8377574c 132
de7f3446 133class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 134 IE_DESC = u'YouTube.com'
cb7dfeea 135 _VALID_URL = r"""(?x)^
c5e8d7af 136 (
83aa5293 137 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 138 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 139 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 140 (?:www\.)?pwnyoutube\.com/|
f7000f3a 141 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
142 tube\.majestyc\.net/|
143 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
144 (?:.*?\#/)? # handle anchor (#/) redirect urls
145 (?: # the various things that can precede the ID:
146 (?:(?:v|embed|e)/) # v/ or embed/ or e/
147 |(?: # or the v= param in all its forms
f7000f3a 148 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
149 (?:\?|\#!?) # the params delimiter ? or # or #!
150 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
151 v=
152 )
f4b05232
JMF
153 ))
154 |youtu\.be/ # just youtu.be/xxxx
155 )
c5e8d7af 156 )? # all until now is optional -> you can pass the naked ID
8963d9c2 157 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
158 (?(1).+)? # if we found the ID, everything can follow
159 $"""
c5e8d7af 160 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
161 _formats = {
162 '5': {'ext': 'flv', 'width': 400, 'height': 240},
163 '6': {'ext': 'flv', 'width': 450, 'height': 270},
164 '13': {'ext': '3gp'},
165 '17': {'ext': '3gp', 'width': 176, 'height': 144},
166 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
167 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
168 '34': {'ext': 'flv', 'width': 640, 'height': 360},
169 '35': {'ext': 'flv', 'width': 854, 'height': 480},
170 '36': {'ext': '3gp', 'width': 320, 'height': 240},
171 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
172 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
173 '43': {'ext': 'webm', 'width': 640, 'height': 360},
174 '44': {'ext': 'webm', 'width': 854, 'height': 480},
175 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
176 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
177
1d043b93 178
86fe61c8 179 # 3d videos
43b81eb9
PH
180 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
181 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
182 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
183 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
184 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
185 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
186 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 187
96fb5605 188 # Apple HTTP Live Streaming
43b81eb9
PH
189 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
190 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
191 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
192 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
193 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
194 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
195 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
196
197 # DASH mp4 video
43b81eb9
PH
198 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
199 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
200 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
201 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
202 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
203 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
204 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
205 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 206
f6f1fc92 207 # Dash mp4 audio
2c62dc26
PH
208 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
209 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
210 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
211
212 # Dash webm
bc6d5978
JMF
213 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
214 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
215 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
216 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
217 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
218 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
43b81eb9
PH
219 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40},
220 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40},
221 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
222 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
223 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
224 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40},
225 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40},
2c62dc26
PH
226
227 # Dash webm audio
228 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
229 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
230
231 # RTMP (unnamed)
232 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 233 }
836a086c 234
c5e8d7af 235 IE_NAME = u'youtube'
2eb88d95
PH
236 _TESTS = [
237 {
0e853ca4
PH
238 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
239 u"file": u"BaW_jenozKc.mp4",
240 u"info_dict": {
241 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
242 u"uploader": u"Philipp Hagemeister",
243 u"uploader_id": u"phihag",
244 u"upload_date": u"20121002",
27dcce19 245 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 246 }
0e853ca4 247 },
0e853ca4
PH
248 {
249 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
250 u"file": u"UxxajLWwzqY.mp4",
251 u"note": u"Test generic use_cipher_signature video (#897)",
252 u"info_dict": {
253 u"upload_date": u"20120506",
254 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 255 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 256 u"uploader": u"Icona Pop",
0e853ca4 257 u"uploader_id": u"IconaPop"
2eb88d95 258 }
c108eb73
JMF
259 },
260 {
261 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
262 u"file": u"07FYdnEawAQ.mp4",
263 u"note": u"Test VEVO video with age protection (#956)",
264 u"info_dict": {
265 u"upload_date": u"20130703",
266 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
267 u"description": u"md5:64249768eec3bc4276236606ea996373",
268 u"uploader": u"justintimberlakeVEVO",
269 u"uploader_id": u"justintimberlakeVEVO"
270 }
271 },
fccd3771 272 {
83aa5293 273 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
274 u"file": u"yZIXLfi8CZQ.mp4",
275 u"note": u"Embed-only video (#1746)",
276 u"info_dict": {
277 u"upload_date": u"20120608",
278 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
279 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
280 u"uploader": u"SET India",
281 u"uploader_id": u"setindia"
282 }
283 },
dd27fd17
PH
284 {
285 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
286 u"file": u"a9LDPn-MO4I.m4a",
287 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
288 u"info_dict": {
289 u"upload_date": "20121002",
290 u"uploader_id": "8KVIDEO",
291 u"description": "No description available.",
292 u"uploader": "8KVIDEO",
293 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
294 },
295 u"params": {
296 u"youtube_include_dash_manifest": True,
297 u"format": "141",
298 },
dd27fd17 299 },
3489b7d2
JMF
300 # DASH manifest with encrypted signature
301 {
302 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
303 u'info_dict': {
304 u'id': u'IB3lcPjvWLA',
305 u'ext': u'm4a',
306 u'title': u'Afrojack - The Spark ft. Spree Wilson',
307 u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
308 u'uploader': u'AfrojackVEVO',
309 u'uploader_id': u'AfrojackVEVO',
310 u'upload_date': u'20131011',
311 },
312 u"params": {
313 u'youtube_include_dash_manifest': True,
314 u'format': '141',
315 },
316 },
2eb88d95
PH
317 ]
318
c5e8d7af
PH
319
320 @classmethod
321 def suitable(cls, url):
322 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 323 if YoutubePlaylistIE.suitable(url): return False
fccd3771 324 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 325
e0df6211
PH
326 def __init__(self, *args, **kwargs):
327 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 328 self._player_cache = {}
e0df6211 329
c5e8d7af
PH
330 def report_video_info_webpage_download(self, video_id):
331 """Report attempt to download video info webpage."""
332 self.to_screen(u'%s: Downloading video info webpage' % video_id)
333
c5e8d7af
PH
334 def report_information_extraction(self, video_id):
335 """Report attempt to extract video information."""
336 self.to_screen(u'%s: Extracting video information' % video_id)
337
338 def report_unavailable_format(self, video_id, format):
339 """Report extracted video URL."""
340 self.to_screen(u'%s: Format %s not available' % (video_id, format))
341
342 def report_rtmp_download(self):
343 """Indicate the download will use the RTMP protocol."""
344 self.to_screen(u'RTMP download detected')
345
c4417ddb
PH
346 def _extract_signature_function(self, video_id, player_url, slen):
347 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 348 player_url)
e0df6211
PH
349 player_type = id_m.group('ext')
350 player_id = id_m.group('id')
351
c4417ddb
PH
352 # Read from filesystem cache
353 func_id = '%s_%s_%d' % (player_type, player_id, slen)
354 assert os.path.basename(func_id) == func_id
c38b1e77 355 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 356
c3c88a26 357 cache_enabled = cache_dir is not None
f8061589 358 if cache_enabled:
c4417ddb
PH
359 cache_fn = os.path.join(os.path.expanduser(cache_dir),
360 u'youtube-sigfuncs',
361 func_id + '.json')
362 try:
edf3e38e 363 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
364 cache_spec = json.load(cachef)
365 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 366 except IOError:
c4417ddb 367 pass # No cache available
83799698 368
e0df6211
PH
369 if player_type == 'js':
370 code = self._download_webpage(
371 player_url, video_id,
83799698 372 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 373 errnote=u'Download of %s failed' % player_url)
83799698 374 res = self._parse_sig_js(code)
c4417ddb 375 elif player_type == 'swf':
e0df6211
PH
376 urlh = self._request_webpage(
377 player_url, video_id,
83799698 378 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
379 errnote=u'Download of %s failed' % player_url)
380 code = urlh.read()
83799698 381 res = self._parse_sig_swf(code)
e0df6211
PH
382 else:
383 assert False, 'Invalid player type %r' % player_type
384
f8061589 385 if cache_enabled:
edf3e38e 386 try:
c705320f
PH
387 test_string = u''.join(map(compat_chr, range(slen)))
388 cache_res = res(test_string)
edf3e38e
PH
389 cache_spec = [ord(c) for c in cache_res]
390 try:
391 os.makedirs(os.path.dirname(cache_fn))
392 except OSError as ose:
393 if ose.errno != errno.EEXIST:
394 raise
395 write_json_file(cache_spec, cache_fn)
0ca96d48 396 except Exception:
edf3e38e
PH
397 tb = traceback.format_exc()
398 self._downloader.report_warning(
399 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
400
401 return res
402
edf3e38e
PH
403 def _print_sig_code(self, func, slen):
404 def gen_sig_code(idxs):
405 def _genslice(start, end, step):
406 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
407 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
408 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
409 return u's[%s%s%s]' % (starts, ends, steps)
410
411 step = None
0ca96d48
PH
412 start = '(Never used)' # Quelch pyflakes warnings - start will be
413 # set as soon as step is set
edf3e38e
PH
414 for i, prev in zip(idxs[1:], idxs[:-1]):
415 if step is not None:
416 if i - prev == step:
417 continue
418 yield _genslice(start, prev, step)
419 step = None
420 continue
421 if i - prev in [-1, 1]:
422 step = i - prev
423 start = prev
424 continue
425 else:
426 yield u's[%d]' % prev
427 if step is None:
428 yield u's[%d]' % i
429 else:
430 yield _genslice(start, i, step)
431
c705320f
PH
432 test_string = u''.join(map(compat_chr, range(slen)))
433 cache_res = func(test_string)
edf3e38e
PH
434 cache_spec = [ord(c) for c in cache_res]
435 expr_code = u' + '.join(gen_sig_code(cache_spec))
436 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 437 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 438
e0df6211
PH
439 def _parse_sig_js(self, jscode):
440 funcname = self._search_regex(
441 r'signature=([a-zA-Z]+)', jscode,
2b25cb5d
PH
442 u'Initial JS player signature function name')
443
444 jsi = JSInterpreter(jscode)
445 initial_function = jsi.extract_function(funcname)
e0df6211
PH
446 return lambda s: initial_function([s])
447
448 def _parse_sig_swf(self, file_contents):
449 if file_contents[1:3] != b'WS':
450 raise ExtractorError(
451 u'Not an SWF file; header is %r' % file_contents[:3])
452 if file_contents[:1] == b'C':
453 content = zlib.decompress(file_contents[8:])
454 else:
455 raise NotImplementedError(u'Unsupported compression format %r' %
456 file_contents[:1])
457
458 def extract_tags(content):
459 pos = 0
460 while pos < len(content):
461 header16 = struct.unpack('<H', content[pos:pos+2])[0]
462 pos += 2
463 tag_code = header16 >> 6
464 tag_len = header16 & 0x3f
465 if tag_len == 0x3f:
466 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
467 pos += 4
468 assert pos+tag_len <= len(content)
469 yield (tag_code, content[pos:pos+tag_len])
470 pos += tag_len
471
472 code_tag = next(tag
473 for tag_code, tag in extract_tags(content)
474 if tag_code == 82)
475 p = code_tag.index(b'\0', 4) + 1
ba552f54 476 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
477
478 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
479 def read_int(reader=None):
480 if reader is None:
481 reader = code_reader
e0df6211
PH
482 res = 0
483 shift = 0
484 for _ in range(5):
ba552f54
PH
485 buf = reader.read(1)
486 assert len(buf) == 1
487 b = struct.unpack('<B', buf)[0]
e0df6211
PH
488 res = res | ((b & 0x7f) << shift)
489 if b & 0x80 == 0:
490 break
491 shift += 7
ba552f54
PH
492 return res
493
494 def u30(reader=None):
495 res = read_int(reader)
496 assert res & 0xf0000000 == 0
e0df6211
PH
497 return res
498 u32 = read_int
499
ba552f54
PH
500 def s32(reader=None):
501 v = read_int(reader)
e0df6211
PH
502 if v & 0x80000000 != 0:
503 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
504 return v
505
0ca96d48 506 def read_string(reader=None):
ba552f54
PH
507 if reader is None:
508 reader = code_reader
509 slen = u30(reader)
510 resb = reader.read(slen)
511 assert len(resb) == slen
512 return resb.decode('utf-8')
513
514 def read_bytes(count, reader=None):
515 if reader is None:
516 reader = code_reader
517 resb = reader.read(count)
518 assert len(resb) == count
519 return resb
520
521 def read_byte(reader=None):
522 resb = read_bytes(1, reader=reader)
523 res = struct.unpack('<B', resb)[0]
524 return res
e0df6211
PH
525
526 # minor_version + major_version
0ca96d48 527 read_bytes(2 + 2)
e0df6211
PH
528
529 # Constant pool
ba552f54 530 int_count = u30()
e0df6211 531 for _c in range(1, int_count):
0ca96d48 532 s32()
ba552f54 533 uint_count = u30()
e0df6211 534 for _c in range(1, uint_count):
0ca96d48 535 u32()
ba552f54 536 double_count = u30()
0ca96d48 537 read_bytes((double_count-1) * 8)
ba552f54 538 string_count = u30()
e0df6211
PH
539 constant_strings = [u'']
540 for _c in range(1, string_count):
0ca96d48 541 s = read_string()
e0df6211 542 constant_strings.append(s)
ba552f54 543 namespace_count = u30()
e0df6211 544 for _c in range(1, namespace_count):
0ca96d48
PH
545 read_bytes(1) # kind
546 u30() # name
ba552f54 547 ns_set_count = u30()
e0df6211 548 for _c in range(1, ns_set_count):
ba552f54 549 count = u30()
e0df6211 550 for _c2 in range(count):
0ca96d48 551 u30()
ba552f54 552 multiname_count = u30()
e0df6211
PH
553 MULTINAME_SIZES = {
554 0x07: 2, # QName
555 0x0d: 2, # QNameA
556 0x0f: 1, # RTQName
557 0x10: 1, # RTQNameA
558 0x11: 0, # RTQNameL
559 0x12: 0, # RTQNameLA
560 0x09: 2, # Multiname
561 0x0e: 2, # MultinameA
562 0x1b: 1, # MultinameL
563 0x1c: 1, # MultinameLA
564 }
565 multinames = [u'']
566 for _c in range(1, multiname_count):
ba552f54 567 kind = u30()
e0df6211
PH
568 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
569 if kind == 0x07:
0ca96d48 570 u30() # namespace_idx
ba552f54 571 name_idx = u30()
e0df6211
PH
572 multinames.append(constant_strings[name_idx])
573 else:
574 multinames.append('[MULTINAME kind: %d]' % kind)
575 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 576 u30()
e0df6211
PH
577
578 # Methods
ba552f54 579 method_count = u30()
e0df6211
PH
580 MethodInfo = collections.namedtuple(
581 'MethodInfo',
582 ['NEED_ARGUMENTS', 'NEED_REST'])
583 method_infos = []
584 for method_id in range(method_count):
ba552f54 585 param_count = u30()
0ca96d48 586 u30() # return type
e0df6211 587 for _ in range(param_count):
0ca96d48
PH
588 u30() # param type
589 u30() # name index (always 0 for youtube)
ba552f54 590 flags = read_byte()
e0df6211
PH
591 if flags & 0x08 != 0:
592 # Options present
ba552f54 593 option_count = u30()
e0df6211 594 for c in range(option_count):
0ca96d48
PH
595 u30() # val
596 read_bytes(1) # kind
e0df6211
PH
597 if flags & 0x80 != 0:
598 # Param names present
599 for _ in range(param_count):
0ca96d48 600 u30() # param name
e0df6211
PH
601 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
602 method_infos.append(mi)
603
604 # Metadata
ba552f54 605 metadata_count = u30()
e0df6211 606 for _c in range(metadata_count):
0ca96d48 607 u30() # name
ba552f54 608 item_count = u30()
e0df6211 609 for _c2 in range(item_count):
0ca96d48
PH
610 u30() # key
611 u30() # value
ba552f54
PH
612
613 def parse_traits_info():
614 trait_name_idx = u30()
615 kind_full = read_byte()
e0df6211
PH
616 kind = kind_full & 0x0f
617 attrs = kind_full >> 4
618 methods = {}
619 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
620 u30() # Slot id
621 u30() # type_name_idx
ba552f54 622 vindex = u30()
e0df6211 623 if vindex != 0:
0ca96d48 624 read_byte() # vkind
e0df6211 625 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 626 u30() # disp_id
ba552f54 627 method_idx = u30()
e0df6211
PH
628 methods[multinames[trait_name_idx]] = method_idx
629 elif kind == 0x04: # Class
0ca96d48
PH
630 u30() # slot_id
631 u30() # classi
e0df6211 632 elif kind == 0x05: # Function
0ca96d48 633 u30() # slot_id
ba552f54 634 function_idx = u30()
e0df6211
PH
635 methods[function_idx] = multinames[trait_name_idx]
636 else:
637 raise ExtractorError(u'Unsupported trait kind %d' % kind)
638
639 if attrs & 0x4 != 0: # Metadata present
ba552f54 640 metadata_count = u30()
e0df6211 641 for _c3 in range(metadata_count):
0ca96d48 642 u30() # metadata index
e0df6211 643
ba552f54 644 return methods
e0df6211
PH
645
646 # Classes
647 TARGET_CLASSNAME = u'SignatureDecipher'
648 searched_idx = multinames.index(TARGET_CLASSNAME)
649 searched_class_id = None
ba552f54 650 class_count = u30()
e0df6211 651 for class_id in range(class_count):
ba552f54 652 name_idx = u30()
e0df6211
PH
653 if name_idx == searched_idx:
654 # We found the class we're looking for!
655 searched_class_id = class_id
0ca96d48 656 u30() # super_name idx
ba552f54 657 flags = read_byte()
e0df6211 658 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 659 u30() # protected_ns_idx
ba552f54 660 intrf_count = u30()
e0df6211 661 for _c2 in range(intrf_count):
0ca96d48
PH
662 u30()
663 u30() # iinit
ba552f54 664 trait_count = u30()
e0df6211 665 for _c2 in range(trait_count):
0ca96d48 666 parse_traits_info()
e0df6211
PH
667
668 if searched_class_id is None:
669 raise ExtractorError(u'Target class %r not found' %
670 TARGET_CLASSNAME)
671
672 method_names = {}
673 method_idxs = {}
674 for class_id in range(class_count):
0ca96d48 675 u30() # cinit
ba552f54 676 trait_count = u30()
e0df6211 677 for _c2 in range(trait_count):
ba552f54 678 trait_methods = parse_traits_info()
e0df6211
PH
679 if class_id == searched_class_id:
680 method_names.update(trait_methods.items())
681 method_idxs.update(dict(
682 (idx, name)
683 for name, idx in trait_methods.items()))
684
685 # Scripts
ba552f54 686 script_count = u30()
e0df6211 687 for _c in range(script_count):
0ca96d48 688 u30() # init
ba552f54 689 trait_count = u30()
e0df6211 690 for _c2 in range(trait_count):
0ca96d48 691 parse_traits_info()
e0df6211
PH
692
693 # Method bodies
ba552f54 694 method_body_count = u30()
e0df6211
PH
695 Method = collections.namedtuple('Method', ['code', 'local_count'])
696 methods = {}
697 for _c in range(method_body_count):
ba552f54 698 method_idx = u30()
0ca96d48 699 u30() # max_stack
ba552f54 700 local_count = u30()
0ca96d48
PH
701 u30() # init_scope_depth
702 u30() # max_scope_depth
ba552f54
PH
703 code_length = u30()
704 code = read_bytes(code_length)
e0df6211 705 if method_idx in method_idxs:
ba552f54 706 m = Method(code, local_count)
e0df6211 707 methods[method_idxs[method_idx]] = m
ba552f54 708 exception_count = u30()
e0df6211 709 for _c2 in range(exception_count):
0ca96d48
PH
710 u30() # from
711 u30() # to
712 u30() # target
713 u30() # exc_type
714 u30() # var_name
ba552f54 715 trait_count = u30()
e0df6211 716 for _c2 in range(trait_count):
0ca96d48 717 parse_traits_info()
e0df6211 718
ba552f54 719 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
720 assert len(methods) == len(method_idxs)
721
722 method_pyfunctions = {}
723
724 def extract_function(func_name):
725 if func_name in method_pyfunctions:
726 return method_pyfunctions[func_name]
727 if func_name not in methods:
728 raise ExtractorError(u'Cannot find function %r' % func_name)
729 m = methods[func_name]
730
731 def resfunc(args):
e0df6211
PH
732 registers = ['(this)'] + list(args) + [None] * m.local_count
733 stack = []
734 coder = io.BytesIO(m.code)
735 while True:
736 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 737 if opcode == 36: # pushbyte
e0df6211
PH
738 v = struct.unpack('!B', coder.read(1))[0]
739 stack.append(v)
740 elif opcode == 44: # pushstring
741 idx = u30(coder)
742 stack.append(constant_strings[idx])
743 elif opcode == 48: # pushscope
744 # We don't implement the scope register, so we'll just
745 # ignore the popped value
746 stack.pop()
747 elif opcode == 70: # callproperty
748 index = u30(coder)
749 mname = multinames[index]
750 arg_count = u30(coder)
751 args = list(reversed(
752 [stack.pop() for _ in range(arg_count)]))
753 obj = stack.pop()
754 if mname == u'split':
755 assert len(args) == 1
756 assert isinstance(args[0], compat_str)
757 assert isinstance(obj, compat_str)
758 if args[0] == u'':
759 res = list(obj)
760 else:
761 res = obj.split(args[0])
762 stack.append(res)
a7177865
PH
763 elif mname == u'slice':
764 assert len(args) == 1
765 assert isinstance(args[0], int)
766 assert isinstance(obj, list)
767 res = obj[args[0]:]
768 stack.append(res)
769 elif mname == u'join':
770 assert len(args) == 1
771 assert isinstance(args[0], compat_str)
772 assert isinstance(obj, list)
773 res = args[0].join(obj)
774 stack.append(res)
e0df6211
PH
775 elif mname in method_pyfunctions:
776 stack.append(method_pyfunctions[mname](args))
777 else:
778 raise NotImplementedError(
779 u'Unsupported property %r on %r'
780 % (mname, obj))
a7177865
PH
781 elif opcode == 72: # returnvalue
782 res = stack.pop()
783 return res
784 elif opcode == 79: # callpropvoid
785 index = u30(coder)
786 mname = multinames[index]
787 arg_count = u30(coder)
788 args = list(reversed(
789 [stack.pop() for _ in range(arg_count)]))
790 obj = stack.pop()
791 if mname == u'reverse':
792 assert isinstance(obj, list)
793 obj.reverse()
794 else:
795 raise NotImplementedError(
796 u'Unsupported (void) property %r on %r'
797 % (mname, obj))
e0df6211
PH
798 elif opcode == 93: # findpropstrict
799 index = u30(coder)
800 mname = multinames[index]
801 res = extract_function(mname)
802 stack.append(res)
803 elif opcode == 97: # setproperty
804 index = u30(coder)
805 value = stack.pop()
806 idx = stack.pop()
807 obj = stack.pop()
808 assert isinstance(obj, list)
809 assert isinstance(idx, int)
810 obj[idx] = value
811 elif opcode == 98: # getlocal
812 index = u30(coder)
813 stack.append(registers[index])
814 elif opcode == 99: # setlocal
815 index = u30(coder)
816 value = stack.pop()
817 registers[index] = value
818 elif opcode == 102: # getproperty
819 index = u30(coder)
820 pname = multinames[index]
821 if pname == u'length':
822 obj = stack.pop()
823 assert isinstance(obj, list)
824 stack.append(len(obj))
825 else: # Assume attribute access
826 idx = stack.pop()
827 assert isinstance(idx, int)
828 obj = stack.pop()
829 assert isinstance(obj, list)
830 stack.append(obj[idx])
831 elif opcode == 128: # coerce
0ca96d48 832 u30(coder)
e0df6211
PH
833 elif opcode == 133: # coerce_s
834 assert isinstance(stack[-1], (type(None), compat_str))
835 elif opcode == 164: # modulo
836 value2 = stack.pop()
837 value1 = stack.pop()
838 res = value1 % value2
839 stack.append(res)
a7177865
PH
840 elif opcode == 208: # getlocal_0
841 stack.append(registers[0])
842 elif opcode == 209: # getlocal_1
843 stack.append(registers[1])
844 elif opcode == 210: # getlocal_2
845 stack.append(registers[2])
846 elif opcode == 211: # getlocal_3
847 stack.append(registers[3])
e0df6211
PH
848 elif opcode == 214: # setlocal_2
849 registers[2] = stack.pop()
850 elif opcode == 215: # setlocal_3
851 registers[3] = stack.pop()
852 else:
853 raise NotImplementedError(
854 u'Unsupported opcode %d' % opcode)
855
856 method_pyfunctions[func_name] = resfunc
857 return resfunc
858
859 initial_function = extract_function(u'decipher')
860 return lambda s: initial_function([s])
861
83799698 862 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 863 """Turn the encrypted s field into a working signature"""
6b37f0be 864
83799698 865 if player_url is not None:
9f9be844
PH
866 if player_url.startswith(u'//'):
867 player_url = u'https:' + player_url
e0df6211 868 try:
7f8ae73a
PH
869 player_id = (player_url, len(s))
870 if player_id not in self._player_cache:
83799698 871 func = self._extract_signature_function(
c4417ddb 872 video_id, player_url, len(s)
e0df6211 873 )
7f8ae73a
PH
874 self._player_cache[player_id] = func
875 func = self._player_cache[player_id]
edf3e38e
PH
876 if self._downloader.params.get('youtube_print_sig_code'):
877 self._print_sig_code(func, len(s))
878 return func(s)
0ca96d48 879 except Exception:
e0df6211 880 tb = traceback.format_exc()
83799698
PH
881 self._downloader.report_warning(
882 u'Automatic signature extraction failed: ' + tb)
e0df6211 883
d2d8f895
PH
884 self._downloader.report_warning(
885 u'Warning: Falling back to static signature algorithm')
920de7a2 886
2f2ffea9
PH
887 return self._static_decrypt_signature(
888 s, video_id, player_url, age_gate)
e0df6211 889
2f2ffea9 890 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
891 if age_gate:
892 # The videos with age protection use another player, so the
893 # algorithms can be different.
894 if len(s) == 86:
895 return s[2:63] + s[82] + s[64:82] + s[63]
896
bc4b9008 897 if len(s) == 93:
898 return s[86:29:-1] + s[88] + s[28:5:-1]
899 elif len(s) == 92:
444b1165 900 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
901 elif len(s) == 91:
902 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
903 elif len(s) == 90:
904 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 905 elif len(s) == 89:
906 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 907 elif len(s) == 88:
3e223834 908 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 909 elif len(s) == 87:
3a725669 910 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 911 elif len(s) == 86:
f2c327fd 912 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 913 elif len(s) == 85:
6ae8ee3f 914 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 915 elif len(s) == 84:
6f56389b 916 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 917 elif len(s) == 83:
920de7a2 918 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 919 elif len(s) == 82:
c21315f2 920 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 921 elif len(s) == 81:
aedd6bb9 922 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
923 elif len(s) == 80:
924 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
925 elif len(s) == 79:
926 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
927
928 else:
929 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 930
1f343eaa 931 def _get_available_subtitles(self, video_id, webpage):
de7f3446 932 try:
7fad1c63 933 sub_list = self._download_webpage(
38c2e5b8 934 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
935 video_id, note=False)
936 except ExtractorError as err:
de7f3446
JMF
937 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
938 return {}
939 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
940
941 sub_lang_list = {}
942 for l in lang_list:
943 lang = l[1]
944 params = compat_urllib_parse.urlencode({
945 'lang': lang,
946 'v': video_id,
ca715127 947 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 948 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 949 })
38c2e5b8 950 url = u'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
951 sub_lang_list[lang] = url
952 if not sub_lang_list:
953 self._downloader.report_warning(u'video doesn\'t have subtitles')
954 return {}
955 return sub_lang_list
956
055e6f36 957 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
958 """We need the webpage for getting the captions url, pass it as an
959 argument to speed up the process."""
ca715127 960 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
961 self.to_screen(u'%s: Looking for automatic captions' % video_id)
962 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 963 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
964 if mobj is None:
965 self._downloader.report_warning(err_msg)
966 return {}
967 player_config = json.loads(mobj.group(1))
968 try:
969 args = player_config[u'args']
970 caption_url = args[u'ttsurl']
971 timestamp = args[u'timestamp']
055e6f36
JMF
972 # We get the available subtitles
973 list_params = compat_urllib_parse.urlencode({
974 'type': 'list',
975 'tlangs': 1,
976 'asrs': 1,
de7f3446 977 })
055e6f36 978 list_url = caption_url + '&' + list_params
e26f8712 979 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 980 original_lang_node = caption_list.find('track')
f6a54188 981 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
982 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
983 return {}
984 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
985
986 sub_lang_list = {}
987 for lang_node in caption_list.findall('target'):
988 sub_lang = lang_node.attrib['lang_code']
989 params = compat_urllib_parse.urlencode({
990 'lang': original_lang,
991 'tlang': sub_lang,
992 'fmt': sub_format,
993 'ts': timestamp,
994 'kind': 'asr',
995 })
996 sub_lang_list[sub_lang] = caption_url + '&' + params
997 return sub_lang_list
de7f3446
JMF
998 # An extractor error can be raise by the download process if there are
999 # no automatic captions but there are subtitles
1000 except (KeyError, ExtractorError):
1001 self._downloader.report_warning(err_msg)
1002 return {}
1003
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of cls._VALID_URL matched against url.

    The pattern is applied with re.VERBOSE. Raises ExtractorError when the
    URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1011
1d043b93
JMF
1012 def _extract_from_m3u8(self, manifest_url, video_id):
1013 url_map = {}
1014 def _get_urls(_manifest):
1015 lines = _manifest.split('\n')
1016 urls = filter(lambda l: l and not l.startswith('#'),
1017 lines)
1018 return urls
1019 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1020 formats_urls = _get_urls(manifest)
1021 for format_url in formats_urls:
890f62e8 1022 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1023 url_map[itag] = format_url
1024 return url_map
1025
1fb07d10
JG
1026 def _extract_annotations(self, video_id):
1027 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1028 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1029
def _real_extract(self, url):
    """Extract metadata and the list of formats for a single video URL.

    Downloads the watch page and the get_video_info data, gathers the
    metadata fields (uploader, title, thumbnail, dates, description,
    counts, subtitles, duration, annotations), builds the format list from
    rtmp / url_encoded_fmt_stream_map / adaptive_fmts / HLS data and,
    optionally, the DASH manifest.

    Returns the info dict, or None when only a subtitle listing was
    requested. Raises ExtractorError when the video info is unusable.
    """
    proto = (
        u'http' if self._downloader.params.get('prefer_insecure', False)
        else u'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping the URL inside the page's JSON
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different "el" variants until one yields a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    # Accept both schemes: the page may link to the channel over https
    mobj = re.search(r'<link itemprop="url" href="https?://www\.youtube\.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened redirect links by the full target in their title
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Read a comma-grouped integer from the <span> with the given class
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Pre-initialised so the DASH step below sees a defined (empty) config
    # when the page config could not be found or parsed.
    ytplayer_config = {}
    try:
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        json_code = uppercase_escape(mobj.group(1))
        ytplayer_config = json.loads(json_code)
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    def _map_to_format_list(urlmap):
        # Turn an itag -> URL mapping into a list of format dicts
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Non age-gated videos are deciphered with the JS player
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if (self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                # Index directly so a missing key raises the KeyError that
                # is handled below (instead of a TypeError on None).
                dash_manifest_url = video_info['dashmpd'][0]
            else:
                dash_manifest_url = ytplayer_config['args']['dashmpd']

            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Merge into an already known format with the same id,
                # otherwise add it as a new format
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1356
880e1c52 1357class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1358 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1359 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1360 (?:https?://)?
1361 (?:\w+\.)?
1362 youtube\.com/
1363 (?:
1364 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1365 \? (?:.*?&)*? (?:p|a|list)=
1366 | p/
1367 )
d67cc9fa
JMF
1368 (
1369 (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1370 # Top tracks, they can also include dots
1371 |(?:MC)[\w\.]*
1372 )
c5e8d7af
PH
1373 .*
1374 |
715c8e7b 1375 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1376 )"""
dbb94fb0 1377 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dcbb4580 1378 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
dbb94fb0 1379 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1380 IE_NAME = u'youtube:playlist'
1381
880e1c52
JMF
1382 def _real_initialize(self):
1383 self._login()
1384
652cdaa2
JMF
1385 def _ids_to_results(self, ids):
1386 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1387 for vid_id in ids]
1388
1389 def _extract_mix(self, playlist_id):
1390 # The mixes are generated from a a single video
1391 # the id of the playlist is just 'RD' + video_id
7d4afc55 1392 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1393 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
bc2f773b
JMF
1394 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1395 title_span = (search_title('playlist-title') or
1396 search_title('title long-title') or search_title('title'))
76d1700b 1397 title = clean_html(title_span)
a2dafe28 1398 video_re = r'''(?x)data-video-username="(.*?)".*?
bc2f773b
JMF
1399 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
1400 matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
a2dafe28 1401 # Some of the videos may have been deleted, their username field is empty
bc2f773b 1402 ids = [video_id for (username, video_id) in matches if username]
652cdaa2
JMF
1403 url_results = self._ids_to_results(ids)
1404
1405 return self.playlist_result(url_results, playlist_id, title)
1406
c5e8d7af
PH
1407 def _real_extract(self, url):
1408 # Extract playlist id
d67cc9fa 1409 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1410 if mobj is None:
1411 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1412 playlist_id = mobj.group(1) or mobj.group(2)
1413
1414 # Check if it's a video-specific URL
7c61bd36 1415 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1416 if 'v' in query_dict:
1417 video_id = query_dict['v'][0]
1418 if self._downloader.params.get('noplaylist'):
1419 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1420 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1421 else:
1422 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1423
7d4afc55 1424 if playlist_id.startswith('RD'):
652cdaa2
JMF
1425 # Mixes require a custom extraction process
1426 return self._extract_mix(playlist_id)
0a688bc0
JMF
1427 if playlist_id.startswith('TL'):
1428 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1429 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1430
dbb94fb0
S
1431 url = self._TEMPLATE_URL % playlist_id
1432 page = self._download_webpage(url, playlist_id)
1433 more_widget_html = content_html = page
1434
dcbb4580
JMF
1435 # Extract the video ids from the playlist pages
1436 ids = []
c5e8d7af 1437
755eb032 1438 for page_num in itertools.count(1):
dbb94fb0 1439 matches = re.finditer(self._VIDEO_RE, content_html)
6e47b51e
JMF
1440 # We remove the duplicates and the link with index 0
1441 # (it's not the first video of the playlist)
1442 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1443 ids.extend(new_ids)
c5e8d7af 1444
dbb94fb0
S
1445 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1446 if not mobj:
c5e8d7af
PH
1447 break
1448
dbb94fb0
S
1449 more = self._download_json(
1450 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
1451 content_html = more['content_html']
1452 more_widget_html = more['load_more_widget_html']
1453
1454 playlist_title = self._html_search_regex(
1455 r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')
c5e8d7af 1456
652cdaa2 1457 url_results = self._ids_to_results(ids)
dcbb4580 1458 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1459
1460
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list link on the channel page and return its videos."""
        parsed = re.match(self._VALID_URL, url)
        channel = parsed.group('chann')
        title = parsed.group('title')

        # Locate the link whose href contains the url-encoded list title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1491
1492
class YoutubeChannelIE(InfoExtractor):
    """Extractor for /channel/<id> URLs; yields every video of the channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order, without duplicates."""
        seen = set()
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            # Set membership keeps deduplication linear in the number of links
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result with every video found for the channel."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the response no longer offers a "load more" widget
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1547
1548
class YoutubeUserIE(InfoExtractor):
    """Extractor for user pages and the "ytuser:" keyword (uploads via GData)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Accept url only if no other *IE object of this module accepts it."""
        # This extractor's regex is very permissive, so any URL another
        # extractor of this module can handle must be left to that one.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        # Extract username
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # GData start indices are 1-based
            first = pagenum * self._GDATA_PAGE_SIZE + 1

            api_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first)
            raw = self._download_webpage(
                api_url, username,
                u'Downloading video ids from %d to %d' % (
                    first, first + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(raw)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            # A feed without entries marks the end of the uploads
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        paged_entries = PagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(paged_entries, playlist_title=username)
1609
b05654f0
PH
1610
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor that queries the GData video search API."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        wanted = n
        page_index = 0

        # Each API page holds up to 50 items; keep going until enough pages
        # were requested to cover the (possibly reduced) target count.
        while page_index * 50 < wanted:
            first_item = (50 * page_index) + 1
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query), first_item)
            raw_json = self._download_webpage(
                api_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids.extend(
                item['id'] for item in api_response['items'])

            # Never ask for more than the API says exists.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # Drop any surplus from the last page.
        entries = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids[:n]]
        return self.playlist_result(entries, query)
75dff0ee 1649
c9ae7b95 1650
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint as the parent class, with orderby=published appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1656
c9ae7b95
PH
1657
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results search URL into a playlist of its hits."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Download the results page and return its items as a playlist dict."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        def build_entry(snippet):
            # A missing title is tolerated (fatal=False); the link is required.
            title = self._html_search_regex(
                r'(?s)title="([^"]+)"', snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', href),
                'title': title,
            }

        # One <h3 class="yt-lockup-title"> element per search hit.
        snippets = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)

        return {
            '_type': 'playlist',
            'entries': [build_entry(snippet) for snippet in snippets],
            'title': query,
        }
1692
1693
class YoutubeShowIE(InfoExtractor):
    """Resolve a youtube.com/show/ page into its per-season playlists."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list with one YoutubePlaylist url result per season link."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [
            self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
            for path in season_paths]
04cc9617
JMF
1707
1708
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common machinery for extractors reading from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the remaining %s takes the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk every feed page and return the found videos as a playlist."""
        collected = []
        paging = 0
        page_num = 1
        while True:
            raw = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_num)
            info = json.loads(raw)
            feed_html = info['feed_html']
            # Deduplicate the ids while keeping their order of appearance.
            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # A null paging value marks the last page.
            paging = info['paging']
            if paging is None:
                break
            page_num += 1
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1751
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    # Matches the feed URL or the ":ytsubs" / ":ytsubscriptions" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1757
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    # Matches the feed URL or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1763
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    # Flag read by the base class when it builds the feed URL.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Matches the feed URL or the ":ytwatchlater" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1770
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains regex escapes (\.) that are not valid
    # Python string escapes, so a plain literal triggers warnings. This also
    # matches how the other feed extractors declare their patterns.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Flag read by the base class when it builds the feed URL.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1777
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Redirect the favourites page to the playlist it links to."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a YoutubePlaylist url result for the favourites playlist."""
        # The given url is not used; the my_favorites page is always fetched.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The playlist id is taken from the first "list=" parameter on the page.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1788
1789
class YoutubeTruncatedURLIE(InfoExtractor):
    """Reject watch/attribution URLs that appear to be cut off, with a hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose pattern: a watch URL with no parameters beyond an optional
    # "feature", or an attribution_link URL carrying only "a".
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError advising to quote the URL."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)