]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
[jeuxvideo] Modernize
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import collections
4import errno
5import io
6import itertools
7import json
8import os.path
9import re
10import string
11import struct
12import traceback
13import zlib
14
15from .common import InfoExtractor, SearchInfoExtractor
16from .subtitles import SubtitlesInfoExtractor
17from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 RegexNotFoundError,
33 unescapeHTML,
34 unified_strdate,
35 orderedSet,
36 write_json_file,
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Handles the shared pre-extraction steps: forcing the site language to
    English, logging in with netrc/option-supplied credentials, and passing
    the age-verification interstitial.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Force English UI/metadata via a cookie-setting GET; True on success."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube via the Google ServiceLogin form.

        Returns True on success, False on failure or when no credentials are
        configured; raises ExtractorError when credentials are missing but
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # BUGFIX: previously returned bare None here; return False for a
            # consistent failure value (both are falsy to callers).
            return False

        # Anti-forgery token embedded in the login form; must be echoed back.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in — field set mirrors what the browser form submits.
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8'))
                          for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form; returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Each step short-circuits the rest on failure.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Matches full watch/embed/v URLs on the various YouTube hostnames,
    # youtu.be short links, and bare 11-character video IDs; group 1 is the
    # optional URL prefix, group 2 the video ID.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/ # just youtu.be/xxxx
                         )
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
                     $"""
    # Extracts the target URL from /watch?...&next_url=... redirect pages.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format properties; keys are YouTube's numeric itag
    # identifiers as strings.  'preference' ranks format groups (plain
    # progressive > HLS > 3D > DASH video > DASH audio).
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    # Regression tests consumed by the test runner: URL plus the metadata
    # fields the extractor is expected to produce.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
    ]
299
300
301 @classmethod
302 def suitable(cls, url):
303 """Receives a URL and returns True if suitable for this IE."""
304 if YoutubePlaylistIE.suitable(url): return False
305 return re.match(cls._VALID_URL, url) is not None
306
307 def __init__(self, *args, **kwargs):
308 super(YoutubeIE, self).__init__(*args, **kwargs)
309 self._player_cache = {}
310
311 def report_video_info_webpage_download(self, video_id):
312 """Report attempt to download video info webpage."""
313 self.to_screen(u'%s: Downloading video info webpage' % video_id)
314
315 def report_information_extraction(self, video_id):
316 """Report attempt to extract video information."""
317 self.to_screen(u'%s: Extracting video information' % video_id)
318
319 def report_unavailable_format(self, video_id, format):
320 """Report extracted video URL."""
321 self.to_screen(u'%s: Format %s not available' % (video_id, format))
322
323 def report_rtmp_download(self):
324 """Indicate the download will use the RTMP protocol."""
325 self.to_screen(u'RTMP download detected')
326
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build a signature-decryption function for the given player.

        Downloads the JS or SWF player referenced by player_url, parses out
        its signature routine, and returns a callable mapping an encrypted
        signature string of length slen to the decrypted one.  Results are
        cached on disk (as a character-permutation spec) when a cache dir
        is configured.
        """
        # Player URLs end in "-<id>.<ext>"; ext selects the parser (js/swf).
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # Guard against path traversal via a crafted player URL.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # The cached spec is a list of source indices: output char i
                # comes from input position cache_spec[i].
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            # Derive the permutation spec by running the function on a string
            # of distinct code points, then persist it; caching is best-effort.
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
383
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the extracted signature function.

        Runs func on a probe string of slen distinct characters, recovers the
        index permutation, and prints it as a compact expression of slices
        and single-character picks (for pasting into the static algorithm).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a Python slice, omitting redundant parts
                # (0 start, unit step, negative-step slice to the start).
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk consecutive index pairs, coalescing runs with step +/-1
            # into slices and emitting isolated indices individually.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # NOTE: relies on the loop variable `i` leaking past the for loop
            # to flush the final element/run.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
419
    def _parse_sig_js(self, jscode):
        """Extract the signature function from JS player code.

        Locates the function assigned to `signature=` and interprets the
        small JavaScript subset it uses (assignments, returns, member access,
        indexing, modulo, calls) directly, returning a Python callable that
        maps a signature string to its decrypted form.
        """
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        # Cache of already-extracted helper functions, by name.
        functions = {}

        def argidx(varname):
            # Map a single-letter JS variable name to its position a=0, b=1, ...
            # NOTE(review): string.lowercase is Python 2 only (ascii_lowercase
            # in Python 3); consistent with the rest of this file.
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Evaluate one JS statement: an assignment (possibly indexed) or
            # a return; returns the assigned/returned value.
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    # Indexed assignment: out[index] = expr
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    # Plain assignment: out = expr
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            # Evaluate a JS expression; tries, in order: integer literal,
            # variable, member access, indexing, modulo, function call.
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                member = m.group('member')
                val = local_vars[m.group('in')]
                # Only the handful of members the player actually uses.
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion-1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                fname = m.group('func')
                if fname not in functions:
                    # Lazily extract helper functions as they are first called.
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Find `function <name>(<args>){<code>}` in the player source and
            # wrap its body in a Python callable taking a list of arguments.
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                # The value of the last statement (the return) is the result.
                return res
            return resf

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])
531
    def _parse_sig_swf(self, file_contents):
        """Extract the signature function from an SWF (Flash) player.

        Decompresses the SWF, locates the DoABC tag (AVM2 bytecode), parses
        the ABC constant pool / class / method tables to find the
        `SignatureDecipher` class, then interprets just enough AVM2 opcodes
        to run its `decipher` method.  Returns a callable mapping a
        signature string to its decrypted form.
        """
        # SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed).
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # Skip the 8-byte header (signature, version, file length).
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Iterate SWF tags: 16-bit header packs code (10 bits) and
            # length (6 bits); length 0x3f means a 32-bit length follows.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag 82 is DoABC; skip its 32-bit flags and NUL-terminated name.
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length LEB128-style integer: up to 5 bytes, 7 bits each.
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # Unsigned 30-bit value (top bits must be clear).
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit value in two's complement.
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool — counts are 1-based; index 0 is implicit/empty.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of trailing u30 fields per multiname kind (after the kind
        # byte); only QName (0x07) names are actually resolved below.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Parse one trait entry; returns {trait_name: method_idx} for
            # method-like traits (other kinds are consumed and discarded).
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second class table pass: collect the target class's method names
        # and their method-body indices.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Sanity: the whole DoABC payload was consumed and every wanted
        # method has a body.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Turn the named method's AVM2 bytecode into a Python callable
            # (memoized).  Only the opcodes the decipher routine uses are
            # implemented; anything else raises NotImplementedError.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Register 0 is `this`; then the arguments; then locals.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
945
946 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
947 """Turn the encrypted s field into a working signature"""
948
949 if player_url is not None:
950 if player_url.startswith(u'//'):
951 player_url = u'https:' + player_url
952 try:
953 player_id = (player_url, len(s))
954 if player_id not in self._player_cache:
955 func = self._extract_signature_function(
956 video_id, player_url, len(s)
957 )
958 self._player_cache[player_id] = func
959 func = self._player_cache[player_id]
960 if self._downloader.params.get('youtube_print_sig_code'):
961 self._print_sig_code(func, len(s))
962 return func(s)
963 except Exception:
964 tb = traceback.format_exc()
965 self._downloader.report_warning(
966 u'Automatic signature extraction failed: ' + tb)
967
968 self._downloader.report_warning(
969 u'Warning: Falling back to static signature algorithm')
970
971 return self._static_decrypt_signature(
972 s, video_id, player_url, age_gate)
973
974 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
975 if age_gate:
976 # The videos with age protection use another player, so the
977 # algorithms can be different.
978 if len(s) == 86:
979 return s[2:63] + s[82] + s[64:82] + s[63]
980
981 if len(s) == 93:
982 return s[86:29:-1] + s[88] + s[28:5:-1]
983 elif len(s) == 92:
984 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
985 elif len(s) == 91:
986 return s[84:27:-1] + s[86] + s[26:5:-1]
987 elif len(s) == 90:
988 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
989 elif len(s) == 89:
990 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
991 elif len(s) == 88:
992 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
993 elif len(s) == 87:
994 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
995 elif len(s) == 86:
996 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
997 elif len(s) == 85:
998 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
999 elif len(s) == 84:
1000 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1001 elif len(s) == 83:
1002 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1003 elif len(s) == 82:
1004 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1005 elif len(s) == 81:
1006 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1007 elif len(s) == 80:
1008 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1009 elif len(s) == 79:
1010 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1011
1012 else:
1013 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1014
1015 def _get_available_subtitles(self, video_id, webpage):
1016 try:
1017 sub_list = self._download_webpage(
1018 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1019 video_id, note=False)
1020 except ExtractorError as err:
1021 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1022 return {}
1023 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1024
1025 sub_lang_list = {}
1026 for l in lang_list:
1027 lang = l[1]
1028 params = compat_urllib_parse.urlencode({
1029 'lang': lang,
1030 'v': video_id,
1031 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
1032 'name': unescapeHTML(l[0]).encode('utf-8'),
1033 })
1034 url = u'https://www.youtube.com/api/timedtext?' + params
1035 sub_lang_list[lang] = url
1036 if not sub_lang_list:
1037 self._downloader.report_warning(u'video doesn\'t have subtitles')
1038 return {}
1039 return sub_lang_list
1040
    def _get_available_automatic_caption(self, video_id, webpage):
        """Return a dict mapping language codes to automatic-caption URLs.

        We need the webpage for getting the captions url, so it is passed as
        an argument to speed up the process. Returns an empty dict when no
        automatic (ASR) captions are available.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # Only proceed when the original track is an automatic (ASR) one.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Request a translation of the ASR track into each target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1087
1088 def _extract_id(self, url):
1089 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1090 if mobj is None:
1091 raise ExtractorError(u'Invalid URL: %s' % url)
1092 video_id = mobj.group(2)
1093 return video_id
1094
1095 def _extract_from_m3u8(self, manifest_url, video_id):
1096 url_map = {}
1097 def _get_urls(_manifest):
1098 lines = _manifest.split('\n')
1099 urls = filter(lambda l: l and not l.startswith('#'),
1100 lines)
1101 return urls
1102 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1103 formats_urls = _get_urls(manifest)
1104 for format_url in formats_urls:
1105 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1106 url_map[itag] = format_url
1107 return url_map
1108
1109 def _extract_annotations(self, video_id):
1110 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1111 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1112
    def _real_extract(self, url):
        """Extract metadata and all downloadable formats for a single video."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants until one answers with a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace yt redirect links with just their title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Read an integer counter (e.g. likes) from a span of class *klass*.
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # these signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            # Turn {itag: url} into the list of format dicts yt-dl expects.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (already decrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature; needs to be decrypted first.
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        dash_manifest_url_lst = video_info.get('dashmpd')
        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
                self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                dash_doc = self._download_xml(
                    dash_manifest_url_lst[0], video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    # Merge DASH info into an already-known format with the
                    # same id, otherwise add it as a new format.
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1422
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     |  p/
                     )
                     (
                         (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                         # Top tracks, they can also include dots
                         |(?:MC)[\w\.]*
                     )
                     .*
                  |
                     ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap plain video ids into url results for the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a YouTube mix.

        The mixes are generated from a single video; the id of the playlist
        is just 'RD' + video_id.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract all videos of a playlist, or delegate single videos/mixes."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # BUGFIX: do not hard-code a 'PL' prefix here - playlist_id
                # already carries its real prefix (PL/EC/UU/FL/RD), so the
                # old message printed things like "PLPLxxxx".
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        try:
            playlist_title = self._og_search_title(page)
        except RegexNotFoundError:
            self.report_warning(
                u'Playlist page is missing OpenGraph title, falling back ...',
                playlist_id)
            playlist_title = self._html_search_regex(
                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1519
1520
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for YouTube top lists addressed via the yttoplist keyword."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Locate the named top list on the channel page and extract it."""
        match = re.match(self._VALID_URL, url)
        channel_name = match.group('chann')
        list_title = match.group('title')
        # Find the playlist link on the channel page through its title query.
        title_query = compat_urllib_parse.urlencode({'title': list_title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel_name, list_title)
        playlist_link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(title_query), channel_page, u'list')
        playlist_url = compat_urlparse.urljoin('https://www.youtube.com/', playlist_link)

        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        entry_ids = []
        for attempt in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            playlist_page = self._download_webpage(playlist_url, list_title, msg)
            entry_ids = orderedSet(re.findall(
                r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"',
                playlist_page))
            if entry_ids:
                break
        return self.playlist_result(
            self._ids_to_results(entry_ids), playlist_title=list_title)
1551
1552
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated while
        preserving order of first appearance."""
        ids_in_page = []
        # BUGFIX: use a set for membership; the previous list scan made this
        # O(n^2) on large channel pages.
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Extract all videos of a channel as a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The load-more widget disappears on the last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1608
1609
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't claim the url if any other youtube extractor can handle it:
        # this regex is too permissive and would match their urls as well.
        competing_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in competing_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return every upload of a user as a lazily-paged playlist."""
        # Extract username
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = match.group(1)

        # The YouTube Data API caps each response (currently at 50 videos),
        # so request page after page until a page without entries comes back.
        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers and yield them as url results.
            for entry in response['feed']['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1670
1671
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        collected_ids = []
        pagenum = 0
        limit = n

        while PAGE_SIZE * pagenum < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query), PAGE_SIZE * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend(video['id'] for video in api_response['items'])

            # The API may report fewer total results than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
1709
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same gdata search API as YoutubeSearchIE, but ordered by upload date
    # (orderby=published) and triggered by the ytsearchdate keyword.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1715
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist url result for each season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches]
1729
1730
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Personal feeds (watch later, history) use a different ajax action.
        action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the paged ajax feed and collect every linked video."""
        feed_entries = []
        paging = 0
        for page_index in itertools.count(1):
            info = json.loads(self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_index))
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            for video_id in orderedSet(m.group(1) for m in matches):
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # A null paging token marks the last page.
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1773
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's subscriptions feed."""
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1779
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's recommended-videos feed."""
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1785
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's watch-later list."""
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Watch later is per-user, so the personal-feed AJAX action is required.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
1792
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's watch history feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fix: this pattern was a plain u'' literal, unlike every sibling feed
    # extractor. It only matched because '\.' is not a recognized string
    # escape; use a raw string so the regex is explicit and consistent.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is per-user, so the personal-feed AJAX action is required.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1799
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Redirects the user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # Favourites are stored as a regular playlist; hand off to it.
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1810
1811
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches watch URLs whose v= parameter was eaten by the shell."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        # Such URLs are always the result of an unquoted '&' in the shell,
        # so there is nothing to download - just explain the mistake.
        message = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)