]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
[youtube] Correct subtitle URL (Fixes #2120)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import collections
4import errno
5import io
6import itertools
7import json
8import os.path
9import re
10import string
11import struct
12import traceback
13import zlib
14
15from .common import InfoExtractor, SearchInfoExtractor
16from .subtitles import SubtitlesInfoExtractor
17from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 write_json_file,
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Fetch the language-setting URL.

        Returns True when the (non-fatal) download produced a truthy
        result, False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the credentials returned by _get_login_info().

        Returns True when the login form was submitted and the response
        no longer contains the login form; False in every other case
        (no credentials, download failure, rejected credentials).

        Raises ExtractorError when no username is available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False, consistent with the other failure paths
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form to _AGE_URL; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The POST body must be bytes on Python 3 (same treatment as the
        # login form above); urlencode output is pure ASCII.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        req = compat_urllib_request.Request(self._AGE_URL, age_data)

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
126
127
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose-mode regex: group 1 is the optional URL prefix, group 2 the
    # 11-character video ID.
    _VALID_URL = r"""(?x)^
 (
 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 tube\.majestyc\.net/|
 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
 (?:.*?\#/)? # handle anchor (#/) redirect urls
 (?: # the various things that can precede the ID:
 (?:(?:v|embed|e)/) # v/ or embed/ or e/
 |(?: # or the v= param in all its forms
 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 (?:\?|\#!?) # the params delimiter ? or # or #!
 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
 v=
 )
 ))
 |youtu\.be/ # just youtu.be/xxxx
 )
 )? # all until now is optional -> you can pass the naked ID
 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
 (?(1).+)? # if we found the ID, everything can follow
 $"""
    # Captures the value of a next_url query parameter
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static properties of the known format codes; keys are the codes as
    # strings, values are dicts of whatever is known for that code.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
    }

    IE_NAME = u'youtube'
    # Test cases: each entry gives a URL, the expected output file name and
    # the info_dict fields expected for it ("md5:" values are as written in
    # the original; presumably compared by hash -- confirm in the test
    # harness).
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
    ]
268
269
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs accepted by YoutubePlaylistIE are rejected here; anything else
    is accepted when it matches _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    url_match = re.match(cls._VALID_URL, url)
    return url_match is not None
275
def __init__(self, *args, **kwargs):
    """Initialise the base extractors and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature_length) to a deciphering function
    self._player_cache = dict()
279
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
283
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
287
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
291
def report_rtmp_download(self):
    """Print a notice that the download will go over RTMP."""
    message = u'RTMP download detected'
    self.to_screen(message)
295
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function deciphering signatures of length *slen*.

    The player id and type ('js' or 'swf') are taken from the tail of
    *player_url*.  When a cache directory is configured, a previously
    stored index permutation for this player and length is used if it
    can be read; otherwise the player is downloaded, parsed, and the
    resulting permutation is written to the cache (best effort).

    Raises ExtractorError when the player URL cannot be parsed or the
    player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        raise ExtractorError(
            u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Sanity check: the id is used as a file name below
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except (IOError, ValueError):
            # Missing or unparsable cache file: treat as a cache miss and
            # re-extract (the entry is rewritten below).
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raised explicitly (not asserted) so it still fires under -O
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose i-th character has code
            # point i; the code points of the output are the permutation.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best effort: warn, but still return the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
352
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to *func* for signatures of length *slen*.

    *func* is applied to a probe string whose i-th character has code
    point i; the resulting permutation is rendered as a concatenation of
    index and slice expressions on ``s`` and written with to_screen().
    """
    def gen_sig_code(idxs):
        """Yield index/slice expressions that together reproduce *idxs*."""
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            # A stop below zero cannot be written literally (it would count
            # from the end), so leave it open instead.
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            return  # Nothing to describe

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        # Seed i so a one-element permutation (for which the loop below
        # never runs) still yields its only index.
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; emit it and start afresh
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
388
def _parse_sig_js(self, jscode):
    """Build a Python function from the signature routine in *jscode*.

    The name of the JS function is taken from the first
    ``signature=<name>`` occurrence; that function (and any function it
    calls) is then run by a tiny interpreter supporting assignments,
    ``return``, ``split("")``, ``join("")``, ``length``, ``reverse()``,
    ``slice(n)``, indexing, ``%`` and calls to other functions.

    Returns a callable taking the encrypted signature string.
    Raises ExtractorError for unsupported constructs or when a
    referenced function cannot be found.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    # Cache of already extracted JS functions, by name
    functions = {}

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')

        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m:
            if ass_m.groupdict().get('index'):
                # Assignment to an element: out[index] = expr
                def assign(val):
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    lvar[idx] = val
                    return val
                expr = ass_m.group('expr')
            else:
                # Plain assignment: out = expr
                def assign(val):
                    local_vars[ass_m.group('out')] = val
                    return val
                expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            assign = lambda v: v
            expr = stmt[len(u'return '):]
        else:
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)

        v = interpret_expression(expr, local_vars, allow_recursion)
        return assign(v)

    def interpret_expression(expr, local_vars, allow_recursion):
        if expr.isdigit():
            return int(expr)

        if expr.isalpha():
            return local_vars[expr]

        # Member access / method call on a variable
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        if m:
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
                return list(val)
            if member == 'join("")':
                return u''.join(val)
            if member == 'length':
                return len(val)
            if member == 'reverse()':
                return val[::-1]
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
            if slice_m:
                idx = interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion-1)
                return val[idx:]

        # Indexing: var[expr]
        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        if m:
            val = local_vars[m.group('in')]
            idx = interpret_expression(m.group('idx'), local_vars,
                                       allow_recursion-1)
            return val[idx]

        # Binary modulo
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        if m:
            a = interpret_expression(m.group('a'),
                                     local_vars, allow_recursion)
            b = interpret_expression(m.group('b'),
                                     local_vars, allow_recursion)
            return a % b

        # Call of another JS function with literal/variable arguments
        m = re.match(
            r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        if m:
            fname = m.group('func')
            if fname not in functions:
                functions[fname] = extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        func_m = re.search(
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            jscode)
        if func_m is None:
            # Fail with a meaningful message instead of an AttributeError
            raise ExtractorError(
                u'Could not find JS function %r' % funcname)
        argnames = func_m.group('args').split(',')

        def resf(args):
            local_vars = dict(zip(argnames, args))
            # The value of the last statement is the function's result
            for stmt in func_m.group('code').split(';'):
                res = interpret_statement(stmt, local_vars)
            return res
        return resf

    initial_function = extract_function(funcname)
    return lambda s: initial_function([s])
500
def _parse_sig_swf(self, file_contents):
    """Build a Python function from the ``decipher`` method of an SWF player.

    *file_contents* is the raw (zlib-compressed, "CWS") SWF file.  The
    first tag with code 82 is located, its ABC (AVM2 bytecode) block is
    parsed far enough to find the methods of the class named
    ``SignatureDecipher``, and those methods are run by a small
    interpreter covering the opcodes handled below.

    Returns a callable taking the encrypted signature string.
    Raises ExtractorError for non-SWF input, a missing target class or
    method, or an unsupported trait kind; NotImplementedError for an
    unsupported compression format, property or opcode.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        """Yield (tag_code, tag_body) for every tag in the movie body."""
        # The decompressed body does not start with a tag: it begins with
        # the FrameSize RECT (a 5-bit field width followed by four fields
        # of that width, padded to a whole byte), then the 2-byte frame
        # rate and the 2-byte frame count.  Skip all of that first.
        framesize_nbits = struct.unpack('!B', content[:1])[0] >> 3
        framesize_len = (5 + 4 * framesize_nbits + 7) // 8

        pos = framesize_len + 2 + 2
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Long tag header: the real length follows as 32 bits
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the 4 leading bytes and the NUL-terminated string after them
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        """Read a variable-length integer (7 bits per byte, at most 5 bytes)."""
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            # Two's complement: reinterpret as a negative number
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # Each pool stores count-1 entries (entry 0 is implicit), hence the
    # loops starting at 1.
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # A count of 0 means an empty pool; without the clamp this would ask
    # for -8 bytes, and BytesIO.read() with a negative size consumes the
    # whole remaining stream.
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields following each multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        """Consume one traits_info entry; return the methods it declares."""
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    method_names = {}
    method_idxs = {}  # method index -> method name, for the target class
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole ABC block must have been consumed, and every method of
    # the target class must have a body.
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        """Return (and memoise) a Python callable running method *func_name*."""
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 stands for "this", followed by the arguments and
            # the method's locals.
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
914
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    When a player URL is known, the deciphering function extracted from
    that player (memoised per player URL and signature length) is tried
    first; if that fails in any way, a warning is issued and the static
    algorithm is used instead.
    """
    have_player = player_url is not None
    if have_player:
        # Protocol-relative URLs are fetched over https
        if player_url.startswith(u'//'):
            player_url = u'https:' + player_url
        try:
            cache_key = (player_url, len(s))
            if cache_key not in self._player_cache:
                self._player_cache[cache_key] = self._extract_signature_function(
                    video_id, player_url, len(s)
                )
            decipher = self._player_cache[cache_key]
            if self._downloader.params.get('youtube_print_sig_code'):
                self._print_sig_code(decipher, len(s))
            return decipher(s)
        except Exception:
            # Best effort: report the failure and fall through
            trace = traceback.format_exc()
            self._downloader.report_warning(
                u'Automatic signature extraction failed: ' + trace)

        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')

    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
942
943 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
944 if age_gate:
945 # The videos with age protection use another player, so the
946 # algorithms can be different.
947 if len(s) == 86:
948 return s[2:63] + s[82] + s[64:82] + s[63]
949
950 if len(s) == 93:
951 return s[86:29:-1] + s[88] + s[28:5:-1]
952 elif len(s) == 92:
953 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
954 elif len(s) == 91:
955 return s[84:27:-1] + s[86] + s[26:5:-1]
956 elif len(s) == 90:
957 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
958 elif len(s) == 89:
959 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
960 elif len(s) == 88:
961 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
962 elif len(s) == 87:
963 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
964 elif len(s) == 86:
965 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
966 elif len(s) == 85:
967 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
968 elif len(s) == 84:
969 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
970 elif len(s) == 83:
971 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
972 elif len(s) == 82:
973 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
974 elif len(s) == 81:
975 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
976 elif len(s) == 80:
977 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
978 elif len(s) == 79:
979 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
980
981 else:
982 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
983
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL for *video_id*.

    The track list is downloaded from the timedtext listing endpoint; a
    download failure or an empty list yields a warning and an empty dict.
    *webpage* is not used here.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (track name, language code) pair.
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

    sub_lang_list = {}
    for track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
1009
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping language code to automatic-caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The player configuration embedded in *webpage* supplies the caption
    base URL and timestamp; the list of translation targets is then
    downloaded. Whenever the captions cannot be determined (no player
    config, unparsable config, missing keys, download failure, or no
    'asr' track) a warning is reported and an empty dict is returned.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        # Parsed inside the try: the non-greedy match may cut the JSON
        # short, and a parse error must not abort the whole extraction.
        player_config = json.loads(mobj.group(1))
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles; ValueError covers a
    # player config that is not valid JSON.
    except (KeyError, ValueError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
1056
1057 def _extract_id(self, url):
1058 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1059 if mobj is None:
1060 raise ExtractorError(u'Invalid URL: %s' % url)
1061 video_id = mobj.group(2)
1062 return video_id
1063
1064 def _get_video_url_list(self, url_map):
1065 """
1066 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1067 with the requested formats.
1068 """
1069 existing_formats = [x for x in self._formats if x in url_map]
1070 if len(existing_formats) == 0:
1071 raise ExtractorError(u'no known formats available for video')
1072 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1073 video_url_list.reverse() # order worst to best
1074 return video_url_list
1075
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download the manifest at *manifest_url* and return an {itag: url} dict.

    Every non-empty manifest line that does not start with '#' is taken
    as a format URL; its itag is read from the 'itag/<digits>/' path part.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#' directive/comment lines.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
1089
def _extract_annotations(self, video_id):
    """Download and return the annotations document for *video_id*."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1093
def _real_extract(self, url):
    """Extract metadata and downloadable formats for a single video.

    Downloads the watch page and the get_video_info response, pulls the
    uploader, title, thumbnail, upload date, description, like/dislike
    counts, subtitles, duration and (optionally) annotations out of them,
    then builds the list of formats, deciphering signatures where needed.

    Returns the info dict built at the end of the method, or None when
    the 'listsubtitles' parameter is set (the subtitles are only listed).
    Raises ExtractorError when the video info has no token, the uploader
    is missing, the video is a rental, rtmpe is required, or no usable
    stream information is found.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    # NOTE: `url` is rebound here and again further down for each format URL.
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes from the matched URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each 'el' variant in turn and keep the first response that
        # contains a token (the last response is kept if none does).
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Replace '/', ',' and '-' by spaces and collapse whitespace
        # before handing the text to unified_strdate.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute.
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the page's meta description, or an empty string.
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Return the integer shown in the span with class `klass`
        # (thousands separators removed), or None if it is not found.
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    # With 'listsubtitles' the subtitles are only listed and nothing is returned.
    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download

    # Prefer the stream maps from the watch page's player config when they
    # carry encrypted signatures; ValueError is used as the local
    # "nothing to do" signal and is swallowed below.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present') # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        # NOTE(review): the itag is None here and is later used as a key
        # into self._formats - confirm that mapping has such an entry.
        video_url_list = [(None, video_info['conn'][0])]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        # Each comma-separated entry is a query string describing one format.
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: append it unchanged.
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: it has to be deciphered first.
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        # Verbose mode only: describe the player and the
                        # signature layout on screen.
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without an age gate the JS player URL from the
                        # page's "assets" entry replaces the SWF player URL.
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    formats = []
    for itag, video_real_url in video_url_list:
        dct = {
            'format_id': itag,
            'url': video_real_url,
            'player_url': player_url,
        }
        # Merge in the static description of this itag.
        dct.update(self._formats[itag])
        formats.append(dct)

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
1359
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlist URLs and bare playlist ids.

    Regular playlists are read page by page from _TEMPLATE_URL; ids that
    start with 'RD' (mixes) are read from the watch page instead.
    """
    IE_DESC = u'YouTube.com playlists'
    # Matched with re.VERBOSE. Group 1 is the id taken from a URL (its
    # two-letter prefix is optional), group 2 is a bare, prefixed id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix, read from its watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result (or a single-video result with --no-playlist).

        Raises ExtractorError for URLs that do not match and for 'TL'
        (top list) ids, which are handled by the "yttoplist" keyword.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                # playlist_id already carries its own prefix (PL, UU, RD, ...),
                # so it is printed as-is instead of with a hard-coded 'PL'.
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        # The title is taken from the last page that was downloaded.
        playlist_title = self._og_search_title(page)

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1450
1451
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" keyword."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list link on the channel page and return its videos."""
        channel, title = re.match(self._VALID_URL, url).group('chann', 'title')

        # The list link is recognised by the url-encoded title it contains.
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1482
1483
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists every video of a /channel/<id> URL."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first occurrence order, no duplicates."""
        seen = set()
        unique_ids = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            found = match.group(1)
            if found in seen:
                continue
            seen.add(found)
            unique_ids.append(found)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with all videos of the channel in *url*."""
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(
                    url, channel_id, u'Downloading page #%s' % pagenum)
                page = json.loads(raw_page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # Stop once the "load more" widget is no longer offered.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1539
1540
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, addressed by URL or "ytuser:" keyword."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Number of entries requested per API page.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other '...IE' class in this module accepts *url*."""
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result with every upload of the user in *url*.

        Raises ExtractorError when *url* does not match or when the API
        response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        url_results = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    # The actual id, not the literal string 'video_id'.
                    'id': video_id,
                    'title': title,
                })

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(entries) < self._GDATA_PAGE_SIZE:
                break

        return self.playlist_result(url_results, playlist_title=username)
1610
1611
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the "ytsearch" keyword, backed by the gdata API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        limit = n
        pagenum = 0

        # Pages hold 50 results each; keep fetching until `limit` is covered.
        while 50 * pagenum < limit:
            start_index = 50 * pagenum + 1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        collected_ids = collected_ids[:n]
        videos = []
        for video_id in collected_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1649
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor whose API URL adds orderby=published."""
    _SEARCH_KEY = 'ytsearchdate'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1655
class YoutubeShowIE(InfoExtractor):
    """Extractor for /show/ pages, returning one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return a list of url results, one for each playlist link on the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
1669
1670
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page; its single %s takes the paging value."""
        action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed pages and return all linked videos as a playlist result."""
        collected = []
        paging = 0
        page_number = 1
        while True:
            raw = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw)
            id_matches = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            for video_id in orderedSet(m.group(1) for m in id_matches):
                collected.append(self.url_result(video_id, 'Youtube', video_id=video_id))
            # The response names the next paging value; None ends the feed.
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1713
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1719
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1725
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed, loaded as a personal feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1732
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed, loaded as a personal feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' is a regex escape,
    # not a string escape, and must reach the regex engine unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1739
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that hands the favourites playlist over to the playlist extractor."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Read the playlist id from the my_favorites page and return a url result for it."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1750
1751
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch URLs that end right after the 'feature' parameter.

    Such a URL carries no video id, so extraction always fails with a
    hint about quoting the URL in the shell.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the likely mistake."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)