]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
release 2014.03.03
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import collections
4import errno
5import io
6import itertools
7import json
8import os.path
9import re
10import string
11import struct
12import traceback
13import zlib
14
15from .common import InfoExtractor, SearchInfoExtractor
16from .subtitles import SubtitlesInfoExtractor
17from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 uppercase_escape,
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the site UI to English; return True on success."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube with the configured credentials.

        Returns True on success and False on any failure (including when
        no login info is configured and login is optional).  Raises
        ExtractorError when _LOGIN_REQUIRED is set but no credentials
        are available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # BUGFIX: previously a bare `return` (None); return False so
            # every failure path of _login() reports the same value.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # The login form being served back means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-verification form; returns True (errors are fatal)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Runs once before extraction: set language, log in (both
        # best-effort; abort silently on failure), then confirm age.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose ((?x)) pattern: matches the many watch/embed/short URL shapes
    # (and the naked 11-character video ID); group 1 captures the video ID.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the value of a next_url query parameter from a URL.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known format properties keyed by itag (as a string).  Entries only
    # list what is fixed for that itag; '_rtmp' is a pseudo-itag used for
    # unnamed RTMP streams.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
233
    IE_NAME = u'youtube'
    # Sample URLs with the metadata expected from extraction; consumed by
    # the test harness (md5:... values are checksums of long descriptions).
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
317
318
319 @classmethod
320 def suitable(cls, url):
321 """Receives a URL and returns True if suitable for this IE."""
322 if YoutubePlaylistIE.suitable(url): return False
323 return re.match(cls._VALID_URL, url) is not None
324
325 def __init__(self, *args, **kwargs):
326 super(YoutubeIE, self).__init__(*args, **kwargs)
327 self._player_cache = {}
328
329 def report_video_info_webpage_download(self, video_id):
330 """Report attempt to download video info webpage."""
331 self.to_screen(u'%s: Downloading video info webpage' % video_id)
332
333 def report_information_extraction(self, video_id):
334 """Report attempt to extract video information."""
335 self.to_screen(u'%s: Extracting video information' % video_id)
336
337 def report_unavailable_format(self, video_id, format):
338 """Report extracted video URL."""
339 self.to_screen(u'%s: Format %s not available' % (video_id, format))
340
341 def report_rtmp_download(self):
342 """Indicate the download will use the RTMP protocol."""
343 self.to_screen(u'RTMP download detected')
344
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build the decipher function for a player and signature length.

        The compiled permutation is cached on disk (as a list of source
        indices) so subsequent runs can skip downloading the player.
        """
        # Player id/type are encoded in the player URL, e.g. ...-<id>.<ext>
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # The cached spec is just a list of indices into s.
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            # Derive the index permutation by running the function on a
            # probe string of distinct characters, then persist it; cache
            # write failures only produce a warning.
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
401
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the decipher function *func*.

        Runs func on a probe string, recovers the index permutation and
        compresses consecutive runs into slice expressions.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step] with the default parts omitted.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                # Greedily collect runs with step +1/-1 into slices;
                # isolated indices are emitted as single s[n] terms.
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element or the still-open run.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
437
438 def _parse_sig_js(self, jscode):
439 funcname = self._search_regex(
440 r'signature=([a-zA-Z]+)', jscode,
441 u'Initial JS player signature function name')
442
443 functions = {}
444
445 def argidx(varname):
446 return string.lowercase.index(varname)
447
448 def interpret_statement(stmt, local_vars, allow_recursion=20):
449 if allow_recursion < 0:
450 raise ExtractorError(u'Recursion limit reached')
451
452 if stmt.startswith(u'var '):
453 stmt = stmt[len(u'var '):]
454 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
455 r'=(?P<expr>.*)$', stmt)
456 if ass_m:
457 if ass_m.groupdict().get('index'):
458 def assign(val):
459 lvar = local_vars[ass_m.group('out')]
460 idx = interpret_expression(ass_m.group('index'),
461 local_vars, allow_recursion)
462 assert isinstance(idx, int)
463 lvar[idx] = val
464 return val
465 expr = ass_m.group('expr')
466 else:
467 def assign(val):
468 local_vars[ass_m.group('out')] = val
469 return val
470 expr = ass_m.group('expr')
471 elif stmt.startswith(u'return '):
472 assign = lambda v: v
473 expr = stmt[len(u'return '):]
474 else:
475 raise ExtractorError(
476 u'Cannot determine left side of statement in %r' % stmt)
477
478 v = interpret_expression(expr, local_vars, allow_recursion)
479 return assign(v)
480
481 def interpret_expression(expr, local_vars, allow_recursion):
482 if expr.isdigit():
483 return int(expr)
484
485 if expr.isalpha():
486 return local_vars[expr]
487
488 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
489 if m:
490 member = m.group('member')
491 val = local_vars[m.group('in')]
492 if member == 'split("")':
493 return list(val)
494 if member == 'join("")':
495 return u''.join(val)
496 if member == 'length':
497 return len(val)
498 if member == 'reverse()':
499 return val[::-1]
500 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
501 if slice_m:
502 idx = interpret_expression(
503 slice_m.group('idx'), local_vars, allow_recursion-1)
504 return val[idx:]
505
506 m = re.match(
507 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
508 if m:
509 val = local_vars[m.group('in')]
510 idx = interpret_expression(m.group('idx'), local_vars,
511 allow_recursion-1)
512 return val[idx]
513
514 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
515 if m:
516 a = interpret_expression(m.group('a'),
517 local_vars, allow_recursion)
518 b = interpret_expression(m.group('b'),
519 local_vars, allow_recursion)
520 return a % b
521
522 m = re.match(
523 r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
524 if m:
525 fname = m.group('func')
526 if fname not in functions:
527 functions[fname] = extract_function(fname)
528 argvals = [int(v) if v.isdigit() else local_vars[v]
529 for v in m.group('args').split(',')]
530 return functions[fname](argvals)
531 raise ExtractorError(u'Unsupported JS expression %r' % expr)
532
533 def extract_function(funcname):
534 func_m = re.search(
535 r'function ' + re.escape(funcname) +
536 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
537 jscode)
538 argnames = func_m.group('args').split(',')
539
540 def resf(args):
541 local_vars = dict(zip(argnames, args))
542 for stmt in func_m.group('code').split(';'):
543 res = interpret_statement(stmt, local_vars)
544 return res
545 return resf
546
547 initial_function = extract_function(funcname)
548 return lambda s: initial_function([s])
549
    def _parse_sig_swf(self, file_contents):
        """Build the decipher function from the Flash (SWF) player.

        Decompresses the SWF, locates the DoABC tag (code 82), parses the
        ABC (AVM2 bytecode) constant pool, method and class tables to find
        the SignatureDecipher class, interprets its method bodies with a
        tiny stack-machine, and returns a function mapping an encrypted
        signature string to the deciphered one.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS': body after the 8-byte header is zlib-compressed
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) for every SWF tag in the body;
            # a 6-bit length of 0x3f means an extended 32-bit length follows.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip flags and the NUL-terminated tag name before the ABC data.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length u32: 7 bits per byte, high bit = continuation.
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Same wire format as u32, reinterpreted as two's complement.
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        # Index 0 of each pool is implicit, hence the range(1, count) loops;
        # only the string pool is retained, the rest is skipped.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 fields following each multiname kind byte.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Parse one trait record; returns a dict mapping method names
            # to method indices for Method/Getter/Setter traits (and the
            # reverse for Function traits), empty for other kinds.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second pass over the class table: collect the target class's
        # method name <-> index mappings from its static traits.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Return (and memoize) a Python callable interpreting the AVM2
            # bytecode of the named method; args is the argument-value list.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Stack machine over the subset of opcodes the player uses.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
963
964 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
965 """Turn the encrypted s field into a working signature"""
966
967 if player_url is not None:
968 if player_url.startswith(u'//'):
969 player_url = u'https:' + player_url
970 try:
971 player_id = (player_url, len(s))
972 if player_id not in self._player_cache:
973 func = self._extract_signature_function(
974 video_id, player_url, len(s)
975 )
976 self._player_cache[player_id] = func
977 func = self._player_cache[player_id]
978 if self._downloader.params.get('youtube_print_sig_code'):
979 self._print_sig_code(func, len(s))
980 return func(s)
981 except Exception:
982 tb = traceback.format_exc()
983 self._downloader.report_warning(
984 u'Automatic signature extraction failed: ' + tb)
985
986 self._downloader.report_warning(
987 u'Warning: Falling back to static signature algorithm')
988
989 return self._static_decrypt_signature(
990 s, video_id, player_url, age_gate)
991
992 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
993 if age_gate:
994 # The videos with age protection use another player, so the
995 # algorithms can be different.
996 if len(s) == 86:
997 return s[2:63] + s[82] + s[64:82] + s[63]
998
999 if len(s) == 93:
1000 return s[86:29:-1] + s[88] + s[28:5:-1]
1001 elif len(s) == 92:
1002 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1003 elif len(s) == 91:
1004 return s[84:27:-1] + s[86] + s[26:5:-1]
1005 elif len(s) == 90:
1006 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1007 elif len(s) == 89:
1008 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1009 elif len(s) == 88:
1010 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1011 elif len(s) == 87:
1012 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1013 elif len(s) == 86:
1014 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1015 elif len(s) == 85:
1016 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1017 elif len(s) == 84:
1018 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1019 elif len(s) == 83:
1020 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1021 elif len(s) == 82:
1022 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1023 elif len(s) == 81:
1024 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1025 elif len(s) == 80:
1026 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1027 elif len(s) == 79:
1028 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1029
1030 else:
1031 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1032
1033 def _get_available_subtitles(self, video_id, webpage):
1034 try:
1035 sub_list = self._download_webpage(
1036 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1037 video_id, note=False)
1038 except ExtractorError as err:
1039 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1040 return {}
1041 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1042
1043 sub_lang_list = {}
1044 for l in lang_list:
1045 lang = l[1]
1046 params = compat_urllib_parse.urlencode({
1047 'lang': lang,
1048 'v': video_id,
1049 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
1050 'name': unescapeHTML(l[0]).encode('utf-8'),
1051 })
1052 url = u'https://www.youtube.com/api/timedtext?' + params
1053 sub_lang_list[lang] = url
1054 if not sub_lang_list:
1055 self._downloader.report_warning(u'video doesn\'t have subtitles')
1056 return {}
1057 return sub_lang_list
1058
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a {lang_code: caption_url} map of machine-generated (ASR)
        captions, or {} with a warning if none are available.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The ttsurl/timestamp needed to build caption URLs live in the
        # inline ytplayer.config JSON blob of the watch page.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Only an 'asr' (automatic speech recognition) track counts as
            # automatic captions; anything else means there are none.
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # One translated-caption URL per available target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1105
1106 @classmethod
1107 def extract_id(cls, url):
1108 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1109 if mobj is None:
1110 raise ExtractorError(u'Invalid URL: %s' % url)
1111 video_id = mobj.group(2)
1112 return video_id
1113
1114 def _extract_from_m3u8(self, manifest_url, video_id):
1115 url_map = {}
1116 def _get_urls(_manifest):
1117 lines = _manifest.split('\n')
1118 urls = filter(lambda l: l and not l.startswith('#'),
1119 lines)
1120 return urls
1121 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1122 formats_urls = _get_urls(manifest)
1123 for format_url in formats_urls:
1124 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1125 url_map[itag] = format_url
1126 return url_map
1127
1128 def _extract_annotations(self, video_id):
1129 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1130 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1131
    def _real_extract(self, url):
        """Extract metadata and downloadable formats for a single video.

        Resolves redirect/age-verification URLs, downloads the watch page and
        get_video_info, decrypts stream signatures when present, optionally
        parses the DASH manifest, and returns the standard info dict.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JS-escaped in the page source; strip the backslashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' values; some videos only answer get_video_info
            # for specific ones (a usable response contains 'token').
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect-wrapped links with their title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Pull an integer like "1,234" out of a span with the given class.
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            ytplayer_config = json.loads(mobj.group(1))
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present') # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            # Turn {itag: url} into format dicts, merging known itag metadata.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Pre-decrypted signature supplied directly.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            # Non-age-gated videos use the html5 player; its JS
                            # URL determines the signature function.
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                def decrypt_sig(mobj):
                    # Rewrite each /s/<sig> path segment with its decrypted form.
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        # Merge into a non-DASH format with the same id if present.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1453
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap plain video ids into url results for the Youtube extractor."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a 'mix' playlist.

        Mixes have no regular playlist page; they are generated from a single
        video (the playlist id is just 'RD' + video_id), so the entries are
        scraped from the watch page of that seed video.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username="(.*?)".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        # Some of the videos may have been deleted, their username field is empty
        ids = [video_id for (username, video_id) in matches if username]
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract all entries of a playlist, following 'load more' pages."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # Fixed: the message used to hard-code a 'PL' prefix, but
                # playlist_id already carries its own prefix (PL/EC/UU/FL/RD),
                # so it printed ids like 'PLPL...'.
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Stop when there is no 'load more' widget left to follow.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1556
1557
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Resolve a yttoplist pseudo-URL to the matching top-list playlist."""
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Locate the playlist link on the channel page by its title query.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(query), channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for attempt in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1588
1589
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, deduplicated, in page order."""
        ids_in_page = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect every upload of a channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        # Auto-generated channels list everything on the first page; their
        # ajax pagination pages come back empty.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        video_ids = []
        if autogenerated:
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Page through the json-based channel_ajax endpoint until the
            # 'load more' widget disappears.
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                   for video_id in video_ids]
        return self.playlist_result(entries, channel_id)
1644
1645
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    # gdata v2 API endpoints; 50 is the API's maximum page size.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex would is too permissive and it would match.
        # NOTE: scans module globals for every other *IE class and defers
        # to any of them that claims the URL first.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return all uploads of a user as a lazily-paged playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # gdata's start-index parameter is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # A page without entries signals the end of the uploads feed.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1706
1707
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        page = 0
        limit = n

        # Fetch 50-result pages until we have enough or the API runs out.
        while 50 * page < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * page + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API may hold fewer total results than requested.
            limit = min(n, api_response['totalItems'])
            page += 1

        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
1746
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Identical to YoutubeSearchIE except the gdata query is ordered by
    # publish date (newest first) and it uses the 'ytsearchdate' prefix.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1752
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in m_seasons
        ]
1766
1767
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # These feeds only exist for a logged-in account.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with one remaining %s slot for the paging token.
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        for page_num in itertools.count(1):
            info = json.loads(self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_num))
            feed_html = info['feed_html']
            # Watch links are embedded in the returned HTML fragment;
            # orderedSet keeps first occurrence and drops duplicates.
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            for video_id in orderedSet(m.group(1) for m in matches):
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1810
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1816
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1822
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "watch later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is a per-account feed, not a system feed.
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1829
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Use a raw string: the previous u'' literal contained '\.', an invalid
    # escape sequence (deprecated, a SyntaxWarning/error in newer Pythons),
    # and every sibling extractor already uses r'' for _VALID_URL.
    # The resulting pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is a per-account feed, not a system feed.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1836
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist;
        # extraction is delegated to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1847
1848
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches URLs truncated by the shell (unquoted '&') and explains the fix."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        # Never extracts anything: its only job is a helpful error message.
        message = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)