]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
Some pep8 style fixes
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import collections
4import errno
5import io
6import itertools
7import json
8import os.path
9import re
10import string
11import struct
12import traceback
13import zlib
14
15from .common import InfoExtractor, SearchInfoExtractor
16from .subtitles import SubtitlesInfoExtractor
17from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 RegexNotFoundError,
33 unescapeHTML,
34 unified_strdate,
35 orderedSet,
36 write_json_file,
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Force the YouTube UI language to English (US).

        Non-fatal: returns True if the request succeeded, False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube with the configured credentials.

        Returns True on successful login, False when no credentials are
        available or authentication fails (raises ExtractorError instead
        when _LOGIN_REQUIRED is set and no credentials exist).  Returns
        None when the login page itself could not be fetched.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # The GALX hidden field must be echoed back in the login POST.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the response still contains the login form, the credentials
        # were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age-verification form; always returns True.

        The underlying request is fatal, so a failure raises instead of
        returning.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Run the session-setup sequence: language, optional login, age
        # confirmation.  Each step short-circuits the rest on failure.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
129
130
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (?x) regex; group 1 captures the URL prefix (if any) and the
    # final unnamed group captures the 11-character video ID.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the target URL of a ?next_url=... redirect parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format properties.  Negative 'preference' values rank
    # whole classes of formats (HLS/DASH/audio-only) below the plain
    # progressive formats.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    # Download-test fixtures consumed by the test suite.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
    ]
298
299
300 @classmethod
301 def suitable(cls, url):
302 """Receives a URL and returns True if suitable for this IE."""
303 if YoutubePlaylistIE.suitable(url): return False
304 return re.match(cls._VALID_URL, url) is not None
305
306 def __init__(self, *args, **kwargs):
307 super(YoutubeIE, self).__init__(*args, **kwargs)
308 self._player_cache = {}
309
310 def report_video_info_webpage_download(self, video_id):
311 """Report attempt to download video info webpage."""
312 self.to_screen(u'%s: Downloading video info webpage' % video_id)
313
314 def report_information_extraction(self, video_id):
315 """Report attempt to extract video information."""
316 self.to_screen(u'%s: Extracting video information' % video_id)
317
318 def report_unavailable_format(self, video_id, format):
319 """Report extracted video URL."""
320 self.to_screen(u'%s: Format %s not available' % (video_id, format))
321
322 def report_rtmp_download(self):
323 """Indicate the download will use the RTMP protocol."""
324 self.to_screen(u'RTMP download detected')
325
    def _extract_signature_function(self, video_id, player_url, slen):
        """Download the player (JS or SWF) and build a signature function.

        The returned callable maps an encrypted signature of length slen to
        the deciphered signature.  Because the function is a pure character
        permutation/selection, it is cached on disk as the list of input
        indices it selects, keyed by player type, player id and slen.
        """
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # Guard against path separators sneaking into the cache filename.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # cache_spec is the list of input indices to select.
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            # Record the permutation by running the function over an
            # "identity" string, then persist the index list as JSON.
            # Cache-write failures are non-fatal; we still return res.
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
382
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the extracted signature function.

        Runs func over an "identity" string of length slen to learn which
        input index feeds each output position, then compresses runs of
        consecutive indices (step +1 or -1) into slice expressions.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting defaults.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run; either extend it or flush it.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new consecutive run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element/run (i is the last loop value).
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
418
419 def _parse_sig_js(self, jscode):
420 funcname = self._search_regex(
421 r'signature=([a-zA-Z]+)', jscode,
422 u'Initial JS player signature function name')
423
424 functions = {}
425
426 def argidx(varname):
427 return string.lowercase.index(varname)
428
429 def interpret_statement(stmt, local_vars, allow_recursion=20):
430 if allow_recursion < 0:
431 raise ExtractorError(u'Recursion limit reached')
432
433 if stmt.startswith(u'var '):
434 stmt = stmt[len(u'var '):]
435 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
436 r'=(?P<expr>.*)$', stmt)
437 if ass_m:
438 if ass_m.groupdict().get('index'):
439 def assign(val):
440 lvar = local_vars[ass_m.group('out')]
441 idx = interpret_expression(ass_m.group('index'),
442 local_vars, allow_recursion)
443 assert isinstance(idx, int)
444 lvar[idx] = val
445 return val
446 expr = ass_m.group('expr')
447 else:
448 def assign(val):
449 local_vars[ass_m.group('out')] = val
450 return val
451 expr = ass_m.group('expr')
452 elif stmt.startswith(u'return '):
453 assign = lambda v: v
454 expr = stmt[len(u'return '):]
455 else:
456 raise ExtractorError(
457 u'Cannot determine left side of statement in %r' % stmt)
458
459 v = interpret_expression(expr, local_vars, allow_recursion)
460 return assign(v)
461
462 def interpret_expression(expr, local_vars, allow_recursion):
463 if expr.isdigit():
464 return int(expr)
465
466 if expr.isalpha():
467 return local_vars[expr]
468
469 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
470 if m:
471 member = m.group('member')
472 val = local_vars[m.group('in')]
473 if member == 'split("")':
474 return list(val)
475 if member == 'join("")':
476 return u''.join(val)
477 if member == 'length':
478 return len(val)
479 if member == 'reverse()':
480 return val[::-1]
481 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
482 if slice_m:
483 idx = interpret_expression(
484 slice_m.group('idx'), local_vars, allow_recursion-1)
485 return val[idx:]
486
487 m = re.match(
488 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
489 if m:
490 val = local_vars[m.group('in')]
491 idx = interpret_expression(m.group('idx'), local_vars,
492 allow_recursion-1)
493 return val[idx]
494
495 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
496 if m:
497 a = interpret_expression(m.group('a'),
498 local_vars, allow_recursion)
499 b = interpret_expression(m.group('b'),
500 local_vars, allow_recursion)
501 return a % b
502
503 m = re.match(
504 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
505 if m:
506 fname = m.group('func')
507 if fname not in functions:
508 functions[fname] = extract_function(fname)
509 argvals = [int(v) if v.isdigit() else local_vars[v]
510 for v in m.group('args').split(',')]
511 return functions[fname](argvals)
512 raise ExtractorError(u'Unsupported JS expression %r' % expr)
513
514 def extract_function(funcname):
515 func_m = re.search(
516 r'function ' + re.escape(funcname) +
517 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
518 jscode)
519 argnames = func_m.group('args').split(',')
520
521 def resf(args):
522 local_vars = dict(zip(argnames, args))
523 for stmt in func_m.group('code').split(';'):
524 res = interpret_statement(stmt, local_vars)
525 return res
526 return resf
527
528 initial_function = extract_function(funcname)
529 return lambda s: initial_function([s])
530
    def _parse_sig_swf(self, file_contents):
        """Extract the signature function from a compiled (Flash) player.

        Decompresses the SWF container, locates the tag carrying the AVM2
        (ActionScript 3) bytecode, parses the ABC constant pool / method /
        class tables, and interprets the 'decipher' method of the
        'SignatureDecipher' class with a minimal stack-machine interpreter.
        Returns a callable mapping an encrypted signature string to the
        deciphered one.
        """
        # SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed);
        # only the compressed form is supported here.
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Iterate SWF tags: 16-bit header packs tag code (high 10 bits)
            # and length (low 6 bits); length 0x3f means a 32-bit length
            # field follows.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag 82 is DoABC: flags (4 bytes), NUL-terminated name, then the
        # ABC bytecode block.
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length integer: 7 payload bits per byte, high bit
            # set on all but the last byte, at most 5 bytes.
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # Unsigned 30-bit integer (top 2 bits must be clear).
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit integer (two's complement of the raw value).
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        # NOTE: pool counts include the implicit entry 0, hence the
        # range(1, count) loops.  Only strings and multinames are kept;
        # ints, uints, doubles and namespaces are skipped over.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 fields that follow each multiname kind byte.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only QNames resolve to a usable name string.
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Consume one trait record; returns a dict of the method
            # bindings it declares (name -> method index for methods,
            # index -> name for functions), empty for other trait kinds.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        # First pass over instance records: find which class id carries
        # the target class name.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second pass over class records: collect the target class's
        # method bindings in both directions (name->idx and idx->name).
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        # Keep only the bytecode of the target class's methods, keyed by
        # method name.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Sanity checks: the whole tag must have been consumed and every
        # wanted method must have a body.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Wrap the named method's bytecode in a Python callable,
            # memoized in method_pyfunctions.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Minimal AVM2 stack-machine interpreter supporting only
                # the opcodes the decipher routine actually uses.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        # The decipher method takes the signature as its sole argument.
        return lambda s: initial_function([s])
944
945 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
946 """Turn the encrypted s field into a working signature"""
947
948 if player_url is not None:
949 if player_url.startswith(u'//'):
950 player_url = u'https:' + player_url
951 try:
952 player_id = (player_url, len(s))
953 if player_id not in self._player_cache:
954 func = self._extract_signature_function(
955 video_id, player_url, len(s)
956 )
957 self._player_cache[player_id] = func
958 func = self._player_cache[player_id]
959 if self._downloader.params.get('youtube_print_sig_code'):
960 self._print_sig_code(func, len(s))
961 return func(s)
962 except Exception:
963 tb = traceback.format_exc()
964 self._downloader.report_warning(
965 u'Automatic signature extraction failed: ' + tb)
966
967 self._downloader.report_warning(
968 u'Warning: Falling back to static signature algorithm')
969
970 return self._static_decrypt_signature(
971 s, video_id, player_url, age_gate)
972
973 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
974 if age_gate:
975 # The videos with age protection use another player, so the
976 # algorithms can be different.
977 if len(s) == 86:
978 return s[2:63] + s[82] + s[64:82] + s[63]
979
980 if len(s) == 93:
981 return s[86:29:-1] + s[88] + s[28:5:-1]
982 elif len(s) == 92:
983 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
984 elif len(s) == 91:
985 return s[84:27:-1] + s[86] + s[26:5:-1]
986 elif len(s) == 90:
987 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
988 elif len(s) == 89:
989 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
990 elif len(s) == 88:
991 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
992 elif len(s) == 87:
993 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
994 elif len(s) == 86:
995 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
996 elif len(s) == 85:
997 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
998 elif len(s) == 84:
999 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1000 elif len(s) == 83:
1001 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1002 elif len(s) == 82:
1003 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1004 elif len(s) == 81:
1005 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1006 elif len(s) == 80:
1007 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1008 elif len(s) == 79:
1009 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1010
1011 else:
1012 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1013
1014 def _get_available_subtitles(self, video_id, webpage):
1015 try:
1016 sub_list = self._download_webpage(
1017 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1018 video_id, note=False)
1019 except ExtractorError as err:
1020 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1021 return {}
1022 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1023
1024 sub_lang_list = {}
1025 for l in lang_list:
1026 lang = l[1]
1027 params = compat_urllib_parse.urlencode({
1028 'lang': lang,
1029 'v': video_id,
1030 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
1031 'name': unescapeHTML(l[0]).encode('utf-8'),
1032 })
1033 url = u'http://www.youtube.com/api/timedtext?' + params
1034 sub_lang_list[lang] = url
1035 if not sub_lang_list:
1036 self._downloader.report_warning(u'video doesn\'t have subtitles')
1037 return {}
1038 return sub_lang_list
1039
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language code -> caption url for the
        automatic (ASR) captions, or {} when the video has none.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption url lives in the inline player configuration JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Only an ASR ("automatic speech recognition") first track means
            # automatic captions exist at all.
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Each <target> is a language the ASR track can be translated to.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1086
1087 def _extract_id(self, url):
1088 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1089 if mobj is None:
1090 raise ExtractorError(u'Invalid URL: %s' % url)
1091 video_id = mobj.group(2)
1092 return video_id
1093
1094 def _extract_from_m3u8(self, manifest_url, video_id):
1095 url_map = {}
1096 def _get_urls(_manifest):
1097 lines = _manifest.split('\n')
1098 urls = filter(lambda l: l and not l.startswith('#'),
1099 lines)
1100 return urls
1101 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1102 formats_urls = _get_urls(manifest)
1103 for format_url in formats_urls:
1104 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1105 url_map[itag] = format_url
1106 return url_map
1107
1108 def _extract_annotations(self, video_id):
1109 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1110 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1111
    def _real_extract(self, url):
        """Extract metadata and downloadable formats for a single video.

        Downloads the watch page and the get_video_info endpoint (with an
        alternate flow for age-gated videos), decrypts stream signatures
        when needed, optionally merges the DASH manifest formats, and
        returns the standard info dict.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The url is JSON-escaped in the page; strip the backslashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            # NOTE(review): 'sts' looks like a hard-coded player timestamp
            # expected by the endpoint — confirm against current API.
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts': '1588',
                                                  })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants until one response carries a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            note=False,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect links with their visible title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Pull an integer like "1,234" out of a span with the given class.
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            # Turn {itag: url} into format dicts, merging the static per-itag
            # metadata table when available.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (already usable) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature: needs to be decrypted below.
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                           (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            # Non-age-gated pages embed the html5 player url.
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        dash_manifest_url_lst = video_info.get('dashmpd')
        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
                self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                dash_doc = self._download_xml(
                    dash_manifest_url_lst[0], video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        # Prefer enriching an already-known format over
                        # appending a duplicate entry.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1421
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract every video of a YouTube playlist (including mixes and
    upload/favorite lists) and return them as a playlist result."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        # Wrap bare video ids into url results handled by the Youtube IE.
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # BUG FIX: playlist_id already contains its two-letter prefix
                # (PL/EC/UU/FL/RD), so the old u'Downloading playlist PL%s'
                # message printed a duplicated prefix (e.g. "PLPLabc...").
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        try:
            playlist_title = self._og_search_title(page)
        except RegexNotFoundError:
            self.report_warning(
                u'Playlist page is missing OpenGraph title, falling back ...',
                playlist_id)
            playlist_title = self._html_search_regex(
                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1519
1520
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for a channel's curated "top list" playlists, addressed by
    the yttoplist:{channel}:{list title} pseudo-url."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The channel page links to the playlist through its urlencoded title.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s[^"]+?)"' % re.escape(query), channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for attempt in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1551
1552
class YoutubeChannelIE(InfoExtractor):
    """Extractor for every upload of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect the watch-link video ids in order of first appearance.
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # Download channel page
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = json.loads(self._download_webpage(
                    url, channel_id, u'Downloading page #%s' % pagenum))

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1608
1609
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a user via the GData API, paging through the
    feed 50 entries at a time."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; this regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # BUG FIX: this used to be the literal string 'video_id',
                    # giving every entry the same bogus id instead of the
                    # real one extracted above.
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1670
1671
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backing the "ytsearchN:<query>" keyword."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        while 50 * pagenum < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # The API reports the real total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last 50-entry page.
        del video_ids[n:]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1709
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search extractor, but the API is asked to order results by
    # publication date (orderby=published) and a different key is used.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1715
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season shows; yields one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            results.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
1729
1730
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        for page_num in itertools.count(1):
            raw_page = self._download_webpage(self._FEED_TEMPLATE % paging,
                                              u'%s feed' % self._FEED_NAME,
                                              u'Downloading page %s' % page_num)
            info = json.loads(raw_page)
            feed_html = info['feed_html']
            # Dedupe while preserving first-seen order.
            ids = orderedSet(m.group(1) for m in
                             re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1773
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's subscriptions feed."""
    # Fixed missing space ('keyword(requires' -> 'keyword (requires') so the
    # description matches the phrasing of the sibling feed extractors.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1779
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's recommended-videos feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1785
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's "Watch Later" list."""
    # Watch Later is tied to the account, so it uses the personal-feed action.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1792
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's watch-history feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Use a raw-string literal like every sibling extractor: the previous
    # u'...' form only worked because '\.' is not a recognized escape
    # sequence, and it raises a DeprecationWarning on modern Pythons.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Watch history is per-account, so it goes through the personal-feed action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1799
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolves the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of a regular playlist; hand the
        # actual extraction off to the playlist extractor.
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1810
1811
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches watch URLs whose v= parameter was cut off (typically because
    the URL was passed to the shell unquoted) and fails with a helpful hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        # There is nothing to extract: the video id is missing, so the only
        # useful behavior is an explanatory error.
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)