]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
Merge pull request #3180 from hakatashi/niconico-without-authentication
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Force the site UI language to English (so page regexps match).

        Returns True if the language page was downloaded successfully,
        False otherwise (the download is non-fatal).
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube using the configured username/password.

        Returns True on success and False on any failure (no credentials
        while login is optional, download error, or rejected credentials).
        Raises ExtractorError when _LOGIN_REQUIRED is set but no login
        info is available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Fix: previously returned None here; every other failure path
            # returns False, so keep the boolean contract consistent.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the login form is still present the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form; always returns True on success."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Set language and log in before extraction; each step short-circuits
        # the following ones on failure.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (?x) regex matching the many URL shapes that can carry a
    # YouTube video ID; the 11-character ID itself is the second capture
    # group, and a bare ID with no URL prefix is also accepted.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the target URL from /watch?...&next_url=... redirect links.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # itag -> known static format properties (extension, resolution, codec
    # hints and an ordering 'preference').  Negative preferences push 3D,
    # HLS and raw DASH streams below the plain progressive formats.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
236
    IE_NAME = u'youtube'
    # Test case definitions for this extractor.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
321
322
323 @classmethod
324 def suitable(cls, url):
325 """Receives a URL and returns True if suitable for this IE."""
326 if YoutubePlaylistIE.suitable(url): return False
327 return re.match(cls._VALID_URL, url) is not None
328
329 def __init__(self, *args, **kwargs):
330 super(YoutubeIE, self).__init__(*args, **kwargs)
331 self._player_cache = {}
332
333 def report_video_info_webpage_download(self, video_id):
334 """Report attempt to download video info webpage."""
335 self.to_screen(u'%s: Downloading video info webpage' % video_id)
336
337 def report_information_extraction(self, video_id):
338 """Report attempt to extract video information."""
339 self.to_screen(u'%s: Extracting video information' % video_id)
340
341 def report_unavailable_format(self, video_id, format):
342 """Report extracted video URL."""
343 self.to_screen(u'%s: Format %s not available' % (video_id, format))
344
345 def report_rtmp_download(self):
346 """Indicate the download will use the RTMP protocol."""
347 self.to_screen(u'RTMP download detected')
348
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build the signature-deciphering function for the given player.

        The result is cached on disk as a JSON list of indices (keyed by
        player type, player id and signature length), so a previously seen
        player does not have to be downloaded and parsed again.

        slen is the length of the encrypted signature the function will be
        applied to.
        """
        # The player URL ends in -<id>.<ext>; the extension selects the
        # parser (JavaScript vs. Flash/SWF).
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # Guard against path components sneaking into the cache filename.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # The cached spec is a permutation: output char i comes from
                # input position cache_spec[i].
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # Run the extracted function on a known string to record the
                # index permutation it performs, then persist that as JSON.
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Caching is best-effort: warn but still return the
                # working function.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
405
    def _print_sig_code(self, func, slen):
        """Print Python source code equivalent to the extracted function.

        Applies func to a probe string of length slen, recovers the index
        permutation it performs, and prints it as a sum of slice
        expressions (for pasting into _static_decrypt_signature).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting defaults.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            # Detect maximal runs of consecutive indices (step +1/-1) and
            # emit each run as one slice; isolated indices become s[i].
            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element or the run still in progress.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
441
442 def _parse_sig_js(self, jscode):
443 funcname = self._search_regex(
444 r'signature=([$a-zA-Z]+)', jscode,
445 u'Initial JS player signature function name')
446
447 jsi = JSInterpreter(jscode)
448 initial_function = jsi.extract_function(funcname)
449 return lambda s: initial_function([s])
450
    def _parse_sig_swf(self, file_contents):
        """Extract the signature-deciphering function from the Flash player.

        Decompresses the SWF container, parses the embedded ABC (AVM2
        bytecode) block, locates the SignatureDecipher class and builds a
        minimal bytecode interpreter for its methods.  Returns a function
        mapping an encrypted signature string to the deciphered one.
        """
        # SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed);
        # only the compressed variant is supported here.
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) for each SWF tag; a 6-bit length of
            # 0x3f signals an extended 32-bit length field.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 is DoABC: the ActionScript bytecode container.
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the DoABC flags (4 bytes) and the NUL-terminated name.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length little-endian integer (at most 5 bytes,
            # 7 payload bits per byte; high bit = continuation).
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # Unsigned 30-bit integer: same wire format, top bits must be 0.
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit integer (two's complement of the u32 value).
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool -- note ABC pools implicitly start at index 1, hence
        # the range(1, ...) loops and the placeholder entries at index 0.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of extra u30 fields to skip per multiname kind.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only plain QNames keep their resolved string name; other
                # kinds are recorded as placeholders.
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Consume one trait entry; returns {trait name -> method index}
            # for method-like traits (empty for the others).
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second pass over class bodies: collect the target class's methods.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                # Keep only the bodies belonging to the target class.
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # The whole DoABC payload must have been consumed.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Lazily compile a named method of the target class into a
            # Python callable; results are memoized.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Interpret the subset of AVM2 opcodes the decipher
                # routines actually use, over a simple operand stack.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
864
865 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
866 """Turn the encrypted s field into a working signature"""
867
868 if player_url is not None:
869 if player_url.startswith(u'//'):
870 player_url = u'https:' + player_url
871 try:
872 player_id = (player_url, len(s))
873 if player_id not in self._player_cache:
874 func = self._extract_signature_function(
875 video_id, player_url, len(s)
876 )
877 self._player_cache[player_id] = func
878 func = self._player_cache[player_id]
879 if self._downloader.params.get('youtube_print_sig_code'):
880 self._print_sig_code(func, len(s))
881 return func(s)
882 except Exception:
883 tb = traceback.format_exc()
884 self._downloader.report_warning(
885 u'Automatic signature extraction failed: ' + tb)
886
887 self._downloader.report_warning(
888 u'Warning: Falling back to static signature algorithm')
889
890 return self._static_decrypt_signature(
891 s, video_id, player_url, age_gate)
892
893 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
894 if age_gate:
895 # The videos with age protection use another player, so the
896 # algorithms can be different.
897 if len(s) == 86:
898 return s[2:63] + s[82] + s[64:82] + s[63]
899
900 if len(s) == 93:
901 return s[86:29:-1] + s[88] + s[28:5:-1]
902 elif len(s) == 92:
903 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
904 elif len(s) == 91:
905 return s[84:27:-1] + s[86] + s[26:5:-1]
906 elif len(s) == 90:
907 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
908 elif len(s) == 89:
909 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
910 elif len(s) == 88:
911 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
912 elif len(s) == 87:
913 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
914 elif len(s) == 86:
915 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
916 elif len(s) == 85:
917 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
918 elif len(s) == 84:
919 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
920 elif len(s) == 83:
921 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
922 elif len(s) == 82:
923 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
924 elif len(s) == 81:
925 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
926 elif len(s) == 80:
927 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
928 elif len(s) == 79:
929 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
930
931 else:
932 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
933
934 def _get_available_subtitles(self, video_id, webpage):
935 try:
936 sub_list = self._download_webpage(
937 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
938 video_id, note=False)
939 except ExtractorError as err:
940 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
941 return {}
942 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
943
944 sub_lang_list = {}
945 for l in lang_list:
946 lang = l[1]
947 params = compat_urllib_parse.urlencode({
948 'lang': lang,
949 'v': video_id,
950 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
951 'name': unescapeHTML(l[0]).encode('utf-8'),
952 })
953 url = u'https://www.youtube.com/api/timedtext?' + params
954 sub_lang_list[lang] = url
955 if not sub_lang_list:
956 self._downloader.report_warning(u'video doesn\'t have subtitles')
957 return {}
958 return sub_lang_list
959
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language codes to automatic-caption
        (ASR) URLs, or an empty dict (with a warning) when the video has
        none or the player config cannot be located.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL and timestamp live in the inline player config.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Only tracks of kind 'asr' (automatic speech recognition) count
            # as automatic captions; a missing/other track means there are none.
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1006
1007 @classmethod
1008 def extract_id(cls, url):
1009 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1010 if mobj is None:
1011 raise ExtractorError(u'Invalid URL: %s' % url)
1012 video_id = mobj.group(2)
1013 return video_id
1014
1015 def _extract_from_m3u8(self, manifest_url, video_id):
1016 url_map = {}
1017 def _get_urls(_manifest):
1018 lines = _manifest.split('\n')
1019 urls = filter(lambda l: l and not l.startswith('#'),
1020 lines)
1021 return urls
1022 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1023 formats_urls = _get_urls(manifest)
1024 for format_url in formats_urls:
1025 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1026 url_map[itag] = format_url
1027 return url_map
1028
    def _extract_annotations(self, video_id):
        """Download and return the raw annotations document for *video_id*.

        The document is returned unparsed (as a string) for later writing
        to disk by the caller.
        """
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1032
    def _real_extract(self, url):
        """Extract metadata and downloadable formats for a single video.

        Downloads the watch page, resolves the (possibly encrypted)
        signatures for every stream, optionally merges the DASH manifest
        formats, and returns the standard info dict.
        """
        # Honor --prefer-insecure by building every URL with plain http.
        proto = (
            u'http' if self._downloader.params.get('prefer_insecure', False)
            else u'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants until one of them yields a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    u'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    u'"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Normalize separators to spaces before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = get_element_by_id("eow-category", video_webpage)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace YouTube redirect links with their visible title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Pull a comma-grouped integer out of a span with the given class.
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            # Turn an {itag: url} map into a list of format dicts, merging
            # in the static per-itag metadata from self._formats.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (unencrypted) signature, use it directly.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature, must be descrambled first.
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        # Prefer enriching an already-known format over
                        # appending a duplicate with the same format_id.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1377
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                     )
                     (
                         (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                         # Top tracks, they can also include dots
                         |(?:MC)[\w\.]*
                     )
                     .*
                 |
                     ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Presence of this marker in the "load more" widget means another page exists.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result entry for the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix (auto-generated playlist).

        Mixes are generated from a single video; the id of the playlist is
        just 'RD' + video_id, and the full list is only present on the
        watch page, not on a regular playlist page.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try several known title markups, most specific first.
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract all videos from a playlist URL, following pagination."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Follow the "load more" AJAX link until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1488
1489
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for YouTube top lists addressed by channel and list title."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel_name = match.group('chann')
        list_title = match.group('title')
        # Locate the playlist link on the channel page by its urlencoded title.
        encoded_title = compat_urllib_parse.urlencode({'title': list_title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel_name, list_title)
        link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(encoded_title),
            channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        video_ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for attempt in itertools.count(0):
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, list_title, note)
            video_ids = orderedSet(re.findall(video_re, webpage))
            if video_ids:
                break
        return self.playlist_result(
            self._ids_to_results(video_ids), playlist_title=list_title)
1520
1521
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker whose presence in the "load more" widget means more pages exist.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the deduplicated video ids linked from a channel page."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect every video of a channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels are marked with a dedicated CSS class.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1576
1577
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # GData API page size (maximum results per request).
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors: this regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return all uploads of a user as a lazily-paged playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator yielding url-result dicts for one API page.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries left: end of the user's uploads.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1638
1639
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos API ("ytsearch" keyword)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the API (50 results per request) until *n* ids are
        collected or the API reports no more matches; raises
        ExtractorError when the API returns no items at all.
        """
        video_ids = []
        pagenum = 0
        # The API reports the true total; shrink the limit once known.
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            # Idiomatic extend instead of building a throwaway list.
            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last page (no-op when shorter).
        video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1681
1682
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as YoutubeSearchIE but ordered by upload date (newest first)."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Identical API endpoint plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1688
1689
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results?search_query=... URLs."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # The result list lives inside a single <ol class="item-section">.
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1724
1725
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season shows: one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            results.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
1739
1740
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds require a logged-in session.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with one remaining %s placeholder for the paging token.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed's AJAX pages and return all videos as a playlist."""
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % i)
            # Personal feeds use 'feed_html'; system feeds use 'content_html'.
            feed_html = info.get('feed_html') or info.get('content_html')
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # The "load more" link carries the token of the next page.
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1785
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed with the authenticated user's subscription uploads.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1791
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed with videos YouTube recommends to the authenticated user.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1797
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Watch-later is a personal feed, so it uses action_load_personal_feed.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1804
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string so the \. escapes reach the regex engine unchanged; the
    # previous u'' literal relied on Python passing invalid string escapes
    # through, which is deprecated and inconsistent with the sibling
    # extractors, all of which use r'' for _VALID_URL.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is a personal feed, so it uses action_load_personal_feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1811
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a link to the backing playlist; extract
        # its id and delegate the actual extraction to YoutubePlaylistIE.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1822
1823
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Such URLs are almost always the result of an unquoted shell
        # argument being cut off at the '&', so fail with a helpful hint
        # instead of attempting extraction.
        message = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)