]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[rutube] Modernize
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import struct
11import traceback
12import zlib
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 15from .subtitles import SubtitlesInfoExtractor
2b25cb5d 16from ..jsinterp import JSInterpreter
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
04cc9617 34 orderedSet,
edf3e38e 35 write_json_file,
81c2f20b 36 uppercase_escape,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and return True if a page came back.

        The download is non-fatal, so a failed request yields False
        instead of raising.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True when the login form was submitted and the response
        no longer contains the login form, False in every other case
        (no credentials, login page or login request failed, credentials
        rejected).

        Raises ExtractorError when no username is available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) so every failure path of this
            # method reports the same value.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were
        # not accepted.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; always returns True.

        The download is fatal by default, so a failed request raises
        from _download_webpage rather than returning.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that reports failure (or immediately when no downloader is
        attached)."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 130
8377574c 131
de7f3446 132class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 133 IE_DESC = u'YouTube.com'
cb7dfeea 134 _VALID_URL = r"""(?x)^
c5e8d7af 135 (
83aa5293 136 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 137 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 138 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 139 (?:www\.)?pwnyoutube\.com/|
f7000f3a 140 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
141 tube\.majestyc\.net/|
142 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
143 (?:.*?\#/)? # handle anchor (#/) redirect urls
144 (?: # the various things that can precede the ID:
145 (?:(?:v|embed|e)/) # v/ or embed/ or e/
146 |(?: # or the v= param in all its forms
f7000f3a 147 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
148 (?:\?|\#!?) # the params delimiter ? or # or #!
149 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
150 v=
151 )
f4b05232
JMF
152 ))
153 |youtu\.be/ # just youtu.be/xxxx
154 )
c5e8d7af 155 )? # all until now is optional -> you can pass the naked ID
8963d9c2 156 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
157 (?(1).+)? # if we found the ID, everything can follow
158 $"""
c5e8d7af 159 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
160 _formats = {
161 '5': {'ext': 'flv', 'width': 400, 'height': 240},
162 '6': {'ext': 'flv', 'width': 450, 'height': 270},
163 '13': {'ext': '3gp'},
164 '17': {'ext': '3gp', 'width': 176, 'height': 144},
165 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
166 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
167 '34': {'ext': 'flv', 'width': 640, 'height': 360},
168 '35': {'ext': 'flv', 'width': 854, 'height': 480},
169 '36': {'ext': '3gp', 'width': 320, 'height': 240},
170 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
171 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
172 '43': {'ext': 'webm', 'width': 640, 'height': 360},
173 '44': {'ext': 'webm', 'width': 854, 'height': 480},
174 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
175 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
176
1d043b93 177
86fe61c8 178 # 3d videos
43b81eb9
PH
179 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
180 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
181 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
182 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
183 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
184 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
185 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 186
96fb5605 187 # Apple HTTP Live Streaming
43b81eb9
PH
188 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
189 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
190 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
191 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
192 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
193 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
194 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
195
196 # DASH mp4 video
43b81eb9
PH
197 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
198 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
199 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
200 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
201 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
202 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
203 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
204 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 205
f6f1fc92 206 # Dash mp4 audio
2c62dc26
PH
207 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
208 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
209 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
210
211 # Dash webm
bc6d5978
JMF
212 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
213 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
214 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
215 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
216 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
217 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
43b81eb9
PH
218 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40},
219 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40},
220 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
221 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
222 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
223 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40},
224 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40},
2c62dc26
PH
225
226 # Dash webm audio
227 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
228 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
229
230 # RTMP (unnamed)
231 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 232 }
836a086c 233
c5e8d7af 234 IE_NAME = u'youtube'
2eb88d95
PH
235 _TESTS = [
236 {
0e853ca4
PH
237 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
238 u"file": u"BaW_jenozKc.mp4",
239 u"info_dict": {
240 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
241 u"uploader": u"Philipp Hagemeister",
242 u"uploader_id": u"phihag",
243 u"upload_date": u"20121002",
27dcce19 244 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 245 }
0e853ca4 246 },
0e853ca4
PH
247 {
248 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
249 u"file": u"UxxajLWwzqY.mp4",
250 u"note": u"Test generic use_cipher_signature video (#897)",
251 u"info_dict": {
252 u"upload_date": u"20120506",
253 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
2dc59299 254 u"description": u"md5:5b292926389560516e384ac437c0ec07",
45ed795c 255 u"uploader": u"Icona Pop",
0e853ca4 256 u"uploader_id": u"IconaPop"
2eb88d95 257 }
c108eb73
JMF
258 },
259 {
260 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
261 u"file": u"07FYdnEawAQ.mp4",
262 u"note": u"Test VEVO video with age protection (#956)",
263 u"info_dict": {
264 u"upload_date": u"20130703",
265 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
266 u"description": u"md5:64249768eec3bc4276236606ea996373",
267 u"uploader": u"justintimberlakeVEVO",
268 u"uploader_id": u"justintimberlakeVEVO"
269 }
270 },
fccd3771 271 {
83aa5293 272 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
273 u"file": u"yZIXLfi8CZQ.mp4",
274 u"note": u"Embed-only video (#1746)",
275 u"info_dict": {
276 u"upload_date": u"20120608",
277 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
278 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
279 u"uploader": u"SET India",
280 u"uploader_id": u"setindia"
281 }
282 },
dd27fd17
PH
283 {
284 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
285 u"file": u"a9LDPn-MO4I.m4a",
286 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
287 u"info_dict": {
288 u"upload_date": "20121002",
289 u"uploader_id": "8KVIDEO",
290 u"description": "No description available.",
291 u"uploader": "8KVIDEO",
292 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
293 },
294 u"params": {
295 u"youtube_include_dash_manifest": True,
296 u"format": "141",
297 },
dd27fd17 298 },
3489b7d2
JMF
299 # DASH manifest with encrypted signature
300 {
301 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
302 u'info_dict': {
303 u'id': u'IB3lcPjvWLA',
304 u'ext': u'm4a',
305 u'title': u'Afrojack - The Spark ft. Spree Wilson',
306 u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
307 u'uploader': u'AfrojackVEVO',
308 u'uploader_id': u'AfrojackVEVO',
309 u'upload_date': u'20131011',
310 },
311 u"params": {
312 u'youtube_include_dash_manifest': True,
313 u'format': '141',
314 },
315 },
2eb88d95
PH
316 ]
317
c5e8d7af
PH
318
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs that YoutubePlaylistIE accepts are always declined here; any
    other URL is accepted when it matches _VALID_URL.
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    return (not claimed_by_playlist) and re.match(cls._VALID_URL, url) is not None
c5e8d7af 324
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initialiser and start with an
    empty signature-function cache."""
    parent = super(YoutubeIE, self)
    parent.__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature length).
    self._player_cache = dict()
e0df6211 328
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage of *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
332
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that information for *video_id* is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
336
def report_unavailable_format(self, video_id, format):
    """Print a notice that *format* is not available for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
340
def report_rtmp_download(self):
    """Print a notice that the download will go through RTMP."""
    message = u'RTMP download detected'
    self.to_screen(message)
344
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function that deciphers signatures of length *slen*.

    The player type (file extension) and player id are taken from
    *player_url*.  When a cache directory is configured, a previously
    stored index list for this (type, id, length) is used if it can be
    read; otherwise the player is downloaded, parsed with _parse_sig_js
    or _parse_sig_swf, and the resulting permutation is written back to
    the cache on a best-effort basis.

    Raises ExtractorError when the player URL cannot be parsed or the
    player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_url)
    if id_m is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on None.
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes a file name; it must not contain path components.
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # The file exists but is not valid JSON: treat it as a cache
            # miss; it is rewritten below after a fresh extraction.
            self._downloader.report_warning(
                u'Ignoring unreadable signature cache %r' % cache_fn)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raised explicitly: an assert would vanish under -O and leave
        # `res` unbound.
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose i-th character has code
            # point i, so the output spells out the index permutation.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best effort: report and carry on with `res`.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
401
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python source equivalent to *func* for signatures of length *slen*.

    *func* is applied to a probe string whose i-th character has code
    point i; the code points of the result are the source indices, which
    are then rendered as a sum of ``s[i]`` items and ``s[a:b:c]`` slices
    and sent to to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            # A run that walks down to index 0 must leave the stop open,
            # because a stop of -1 would mean "last element".
            ends = (u':%d' % (end + step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # The pairwise walk below needs at least two indices; with fewer,
        # its loop variable would be unbound after the loop.
        if not idxs:
            yield u"''"
            return
        if len(idxs) == 1:
            yield u's[%d]' % idxs[0]
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev: emit it and start afresh.
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    # NOTE(review): the whitespace after '\n' in this template is kept
    # exactly as it appears in the reviewed source - confirm the intended
    # indentation of the printed snippet.
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 437
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the signature routine is the identifier following
    ``signature=`` in *jscode*; that routine is extracted with
    JSInterpreter and wrapped so it can be called with the signature
    string directly.
    """
    sig_func_name = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return js_function([s])

    return decipher
446
def _parse_sig_swf(self, file_contents):
    """Build a signature function from a zlib-compressed SWF player.

    The ABC (AVM2 bytecode) block is read from SWF tag 82, the methods
    of the class named 'SignatureDecipher' are collected, and a small
    interpreter for the opcodes those methods use is wrapped around its
    'decipher' method.

    Returns a callable that takes the encrypted signature string and
    returns whatever 'decipher' returns for it.

    Raises ExtractorError for non-SWF input, a missing target class, an
    unsupported trait kind or a missing function, and
    NotImplementedError for uncompressed SWF files and for unsupported
    opcodes or properties met while interpreting.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        """Yield (tag_code, tag_body) for every tag in *content*."""
        pos = 0
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Long tag: the real length follows as a 32-bit value.
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the first 4 bytes and the NUL-terminated string after them;
    # the ABC data starts right behind that terminator.
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        """Read a variable-length integer (7 bits per byte, at most 5 bytes)."""
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # Each pool stores count - 1 entries (index 0 is implicit), hence the
    # ranges starting at 1.
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # A count of 0 also means "no entries"; clamp so that we never ask
    # for a negative number of bytes (read(-8) would consume the rest of
    # the stream).
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields that follow each multiname kind.
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder so
            # that indices stay aligned.
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        """Consume one trait entry and return the methods it declares."""
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            # NOTE(review): key and value are the other way round compared
            # to the Method branch above (name -> index) - confirm intent.
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    try:
        searched_idx = multinames.index(TARGET_CLASSNAME)
    except ValueError:
        # Same error as when the name exists but no class carries it.
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    method_names = {}
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Sanity checks: the whole tag was consumed and every method of the
    # target class has a body.
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        """Return (and memoise) a Python function interpreting *func_name*."""
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 stands for `this`, followed by the arguments and
            # the method's locals.
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    # Resolving the name registers the function in
                    # method_pyfunctions for the callproperty that follows.
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
860
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    When a player URL is known, the deciphering function is extracted
    from that player (and cached per player URL and signature length).
    If that fails for any reason, or no player URL is available, the
    hard-coded tables of _static_decrypt_signature are used instead.
    """
    if player_url is None:
        # Nothing to extract the algorithm from: the static tables are
        # the only option, so no fallback warning is emitted.
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

    if player_url.startswith(u'//'):
        # Protocol-relative URL
        player_url = u'https:' + player_url

    try:
        cache_key = (player_url, len(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, len(s))
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, len(s))
        return decipher(s)
    except Exception:
        # Best effort: report the full traceback and fall through to the
        # static algorithm below.
        self._downloader.report_warning(
            u'Automatic signature extraction failed: ' + traceback.format_exc())

    self._downloader.report_warning(
        u'Warning: Falling back to static signature algorithm')
    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
e0df6211 888
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Unscramble the signature s using hard-coded per-length tables.

    The rearrangement applied depends only on len(s) (and, for length
    86, on age_gate). video_id and player_url are not used here.
    Raises ExtractorError when no table exists for the given length.
    """
    sig_len = len(s)

    # The videos with age protection use another player, so the
    # algorithms can be different.
    if age_gate and sig_len == 86:
        return s[2:63] + s[82] + s[64:82] + s[63]

    # One entry per supported signature length; evaluated lazily so that
    # only the matching slice expression is ever computed.
    tables = {
        93: lambda: s[86:29:-1] + s[88] + s[28:5:-1],
        92: lambda: s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83],
        91: lambda: s[84:27:-1] + s[86] + s[26:5:-1],
        90: lambda: s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81],
        89: lambda: s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1],
        88: lambda: s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28],
        87: lambda: s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:],
        86: lambda: s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1],
        85: lambda: s[3:11] + s[0] + s[12:55] + s[84] + s[56:84],
        84: lambda: s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1],
        83: lambda: s[80:63:-1] + s[0] + s[62:0:-1] + s[63],
        82: lambda: s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37],
        81: lambda: s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9],
        80: lambda: s[1:19] + s[0] + s[20:68] + s[19] + s[69:80],
        79: lambda: s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9],
    }

    unscramble = tables.get(sig_len)
    if unscramble is None:
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % sig_len)
    return unscramble()
c5e8d7af 929
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle download URL.

    The track list is fetched from the timedtext service; webpage is
    accepted for interface compatibility but is not used here. An empty
    dict is returned (after a warning) when the list cannot be
    downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_list_xml = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    sub_lang_list = {}
    # Each match is a (track name, language code) pair
    for track_name, lang in re.findall(
            r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_list_xml):
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': sub_format,
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = u'https://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
955
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping language code to automatic caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    This is a best-effort lookup: whenever the player configuration is
    missing or malformed, or the caption list cannot be retrieved, a
    warning is reported and an empty dict is returned.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id

    # Same pattern as the one used when extracting the formats: the dot
    # is literal and the spacing around "=" is not significant.
    mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', webpage)
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The non-greedy match may cut the object short or the page may
        # embed invalid JSON; captions are optional, so do not abort.
        self._downloader.report_warning(err_msg)
        return {}

    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there
    # are no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1002
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. the second group of _VALID_URL.

    The pattern is matched in verbose mode. Raises ExtractorError when
    the URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1010
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict of itag -> stream URL.

    Every non-empty manifest line that does not start with '#' is
    treated as a stream URL; its itag is taken from the 'itag/<n>/'
    path component.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        if not line or line.startswith('#'):
            # Blank line or playlist tag/comment
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
1024
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the in-video annotations document for video_id."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1028
def _real_extract(self, url):
    """Extract metadata and the list of formats for a single video.

    Downloads the watch page and the get_video_info data, collects the
    descriptive fields (uploader, title, thumbnail, dates, counts,
    subtitles, annotations), builds the format list from the RTMP
    connection, the encoded stream maps or the HLS manifest, optionally
    merges in the DASH manifest, and returns the info dict.

    Returns None when the 'listsubtitles' option is set (the available
    subtitles are only listed). Raises ExtractorError when the video
    info carries no token, the uploader is missing, the video is a
    rental, RTMPE is required, or no format source is found.
    """
    proto = (
        u'http' if self._downloader.params.get('prefer_insecure', False)
        else u'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping the URL inside the page source
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different "el" variants until one yields a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Normalize separators and whitespace before parsing the date
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the content of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Parse the integer shown in the <span> with the given class
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # Stays None when the player configuration cannot be found or parsed;
    # the DASH section below relies on this to fail gracefully.
    ytplayer_config = None
    try:
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        json_code = uppercase_escape(mobj.group(1))
        ytplayer_config = json.loads(json_code)
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    def _map_to_format_list(urlmap):
        # Build format dicts from an itag -> URL mapping, enriched with
        # the static per-itag data when the itag is known
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if (self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                # Plain indexing so that a missing key raises KeyError
                # (handled below) rather than TypeError on None[0]
                dash_manifest_url = video_info['dashmpd'][0]
            else:
                if ytplayer_config is None:
                    # The player configuration was not found above
                    raise ExtractorError(
                        u'player configuration not available', expected=True)
                dash_manifest_url = ytplayer_config['args']['dashmpd']

            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already known under this id, or add
                # a new one seeded with the static per-itag data
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1355
880e1c52 1356class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1357 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1358 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1359 (?:https?://)?
1360 (?:\w+\.)?
1361 youtube\.com/
1362 (?:
1363 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1364 \? (?:.*?&)*? (?:p|a|list)=
1365 | p/
1366 )
d67cc9fa
JMF
1367 (
1368 (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1369 # Top tracks, they can also include dots
1370 |(?:MC)[\w\.]*
1371 )
c5e8d7af
PH
1372 .*
1373 |
715c8e7b 1374 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1375 )"""
dbb94fb0 1376 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dcbb4580 1377 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
dbb94fb0 1378 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1379 IE_NAME = u'youtube:playlist'
1380
880e1c52
JMF
1381 def _real_initialize(self):
1382 self._login()
1383
652cdaa2
JMF
1384 def _ids_to_results(self, ids):
1385 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1386 for vid_id in ids]
1387
1388 def _extract_mix(self, playlist_id):
1389 # The mixes are generated from a a single video
1390 # the id of the playlist is just 'RD' + video_id
7d4afc55 1391 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1392 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
bc2f773b
JMF
1393 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1394 title_span = (search_title('playlist-title') or
1395 search_title('title long-title') or search_title('title'))
76d1700b 1396 title = clean_html(title_span)
a2dafe28 1397 video_re = r'''(?x)data-video-username="(.*?)".*?
bc2f773b
JMF
1398 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
1399 matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
a2dafe28 1400 # Some of the videos may have been deleted, their username field is empty
bc2f773b 1401 ids = [video_id for (username, video_id) in matches if username]
652cdaa2
JMF
1402 url_results = self._ids_to_results(ids)
1403
1404 return self.playlist_result(url_results, playlist_id, title)
1405
c5e8d7af
PH
1406 def _real_extract(self, url):
1407 # Extract playlist id
d67cc9fa 1408 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1409 if mobj is None:
1410 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1411 playlist_id = mobj.group(1) or mobj.group(2)
1412
1413 # Check if it's a video-specific URL
7c61bd36 1414 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1415 if 'v' in query_dict:
1416 video_id = query_dict['v'][0]
1417 if self._downloader.params.get('noplaylist'):
1418 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1419 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1420 else:
1421 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1422
7d4afc55 1423 if playlist_id.startswith('RD'):
652cdaa2
JMF
1424 # Mixes require a custom extraction process
1425 return self._extract_mix(playlist_id)
0a688bc0
JMF
1426 if playlist_id.startswith('TL'):
1427 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1428 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1429
dbb94fb0
S
1430 url = self._TEMPLATE_URL % playlist_id
1431 page = self._download_webpage(url, playlist_id)
1432 more_widget_html = content_html = page
1433
dcbb4580
JMF
1434 # Extract the video ids from the playlist pages
1435 ids = []
c5e8d7af 1436
755eb032 1437 for page_num in itertools.count(1):
dbb94fb0 1438 matches = re.finditer(self._VIDEO_RE, content_html)
6e47b51e
JMF
1439 # We remove the duplicates and the link with index 0
1440 # (it's not the first video of the playlist)
1441 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1442 ids.extend(new_ids)
c5e8d7af 1443
dbb94fb0
S
1444 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1445 if not mobj:
c5e8d7af
PH
1446 break
1447
dbb94fb0
S
1448 more = self._download_json(
1449 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
1450 content_html = more['content_html']
1451 more_widget_html = more['load_more_widget_html']
1452
1453 playlist_title = self._html_search_regex(
1454 r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')
c5e8d7af 1455
652cdaa2 1456 url_results = self._ids_to_results(ids)
dcbb4580 1457 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1458
1459
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for channel top lists, addressed as yttoplist:{channel}:{title}."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on how often the list page is requested while waiting
    # for it to contain videos; prevents an endless request loop.
    _MAX_PAGE_ATTEMPTS = 10

    def _real_extract(self, url):
        """Locate the list named in url on the channel page and return its videos.

        Raises ExtractorError if the list page still contains no video
        after _MAX_PAGE_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The link to the list carries the url-encoded title in its query
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry (a bounded number of times) until we get them
        for attempt in range(self._MAX_PAGE_ATTEMPTS):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                u'Unable to find any video in the top list page after %d attempts'
                % self._MAX_PAGE_ATTEMPTS)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1490
1491
class YoutubeChannelIE(InfoExtractor):
    """Extractor returning all the videos of a channel as a playlist."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order, without duplicates."""
        found = []
        seen = set()
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id in seen:
                continue
            seen.add(video_id)
            found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect the ids of the channel's videos and return a playlist result."""
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # Stop once the "load more" widget is no longer offered
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(entries, channel_id)
c5e8d7af
PH
1546
1547
class YoutubeUserIE(InfoExtractor):
    """Extractor returning the uploads of a user as a lazily paged playlist."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result whose entries are fetched page by page."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # pagenum is zero-based, the API start index is one-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            # The last index of the page is inclusive, hence the "- 1"
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE - 1))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No more videos: end of the generator
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1608
b05654f0
PH
1609
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData ``videos`` feed (jsonc format)."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        page_size = 50
        found_ids = []
        # Number of results already requested; also the 0-based offset of
        # the next page.
        offset = 0
        wanted = n

        while offset < wanted:
            quoted_query = compat_urllib_parse.quote_plus(query)
            # The API's start-index parameter is 1-based.
            page_url = self._API_URL % (quoted_query, offset + 1)
            raw_json = self._download_webpage(
                page_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (offset // page_size + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            found_ids.extend(
                video['id'] for video in api_response['items'])

            # Never ask for more than the API says exists.
            wanted = min(n, api_response['totalItems'])
            offset += page_size

        # The last page may overshoot the requested count.
        if len(found_ids) > n:
            found_ids = found_ids[:n]

        videos = []
        for video_id in found_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1648
c9ae7b95 1649
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds ``orderby=published``."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1655
c9ae7b95
PH
1656
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results?search_query=... page into a playlist."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Scrape the result list of the search page at ``url``.

        Returns a playlist dict titled with the decoded search query; each
        entry is a url-type dict holding the absolute item URL and its title.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        # Only look inside the ordered list that holds the search results.
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        def build_entry(snippet):
            # A missing title is tolerated (fatal=False); the href is
            # looked up with the default fatal behaviour.
            item_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', snippet, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        heading_snippets = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = [build_entry(snippet) for snippet in heading_snippets]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1691
1692
class YoutubeShowIE(InfoExtractor):
    """Resolve a youtube.com/show/ page into its per-season playlists."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one per playlist link on the page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(
            url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            u'%s: Found %s seasons' % (show_name, len(season_paths)))

        results = []
        for path in season_paths:
            results.append(self.url_result(
                'https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1706
1707
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common machinery for extractors backed by
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """feed_ajax URL with a single ``%s`` left for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect every video linked from the feed into one playlist.

        Pages are fetched until the response's 'paging' value is None.
        """
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            raw_info = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_num)
            info = json.loads(raw_info)
            feed_html = info['feed_html']
            # Ids may repeat within a page; keep first occurrences in order.
            ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # The response carries the paging value for the next request.
            paging = info['paging']
            if paging is None:
                break
            page_num += 1
        return self.playlist_result(
            feed_entries, playlist_title=self._PLAYLIST_TITLE)
1750
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    # Description reads '... keyword (requires authentication)', with the
    # space before the parenthesis, like the other feed extractors.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1756
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1762
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    # Loaded through action_load_personal_feed rather than the system feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1769
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: the pattern contains '\.', which is an invalid escape
    # sequence in a normal string literal. The resulting pattern text is
    # unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Loaded through action_load_personal_feed rather than the system feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1776
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to its playlist."""
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = u'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result for the playlist id found on my_favorites.

        The page is always fetched from the fixed my_favorites address;
        ``url`` itself is not used.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
15870e90
PH
1787
1788
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution_link URLs that end before any video id and
    answer with a hint about quoting the URL.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with the quoting hint."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .'
        )
        raise ExtractorError(hint, expected=True)