# coding: utf-8

import collections
import errno
import io
import itertools
import json
import os.path
import re
import string
import struct
import traceback
import zlib

from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    compat_chr,
    compat_parse_qs,
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
    compat_str,

    clean_html,
    get_cachedir,
    get_element_by_id,
    get_element_by_attribute,
    ExtractorError,
    int_or_none,
    PagedList,
    RegexNotFoundError,
    unescapeHTML,
    unified_strdate,
    orderedSet,
    write_json_file,
)


class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True, an error is raised when no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True
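
    # Why the UTF-8 round-trip above is needed (a minimal sketch of the
    # Python 2.x behaviour; the value shown is hypothetical):
    #
    #     >>> import urllib
    #     >>> urllib.urlencode({u'_utf8': u'\u972b'})   # raises
    #     UnicodeEncodeError: 'ascii' codec can't encode character ...
    #     >>> urllib.urlencode({'_utf8': u'\u972b'.encode('utf-8')})
    #     '_utf8=%E9%9C%AB'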

    def _confirm_age(self):
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()


class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here it is! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
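
    # Some illustrative URLs this pattern accepts (IDs taken from the tests
    # below; the 11-character video ID is captured by group 2, which is what
    # _extract_id reads):
    #
    #     http://www.youtube.com/watch?v=BaW_jenozKc
    #     https://youtube.googleapis.com/v/BaW_jenozKc
    #     //www.YouTube.com/watch?v=yZIXLfi8CZQ
    #     youtu.be/BaW_jenozKc
    #     BaW_jenozKc                 (just the naked ID)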
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},

        # 3D videos
        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

        # DASH mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # DASH webm
        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

        # DASH webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
    ]

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url) is not None

    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _extract_signature_function(self, video_id, player_url, slen):
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
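
    # The cached spec replays the signature function as a pure index
    # permutation; a minimal sketch with a hypothetical spec:
    #
    #     cache_spec = [2, 0, 1]                        # hypothetical
    #     func = lambda s: u''.join(s[i] for i in cache_spec)
    #     assert func(u'abc') == u'cab'
    #
    # This works because the real functions only reorder/drop characters,
    # so probing them once with chr(0)chr(1)... recovers the whole mapping.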

    def _print_sig_code(self, func, slen):
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end + step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quell pyflakes warnings - start will be set as soon as step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
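
    # For the hypothetical spec [2, 0, 1] above, this would print a
    # ready-to-paste branch for _static_decrypt_signature:
    #
    #     if len(s) == 3:
    #         return s[2] + s[:2]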

    def _parse_sig_js(self, jscode):
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        functions = {}

        def argidx(varname):
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
            if ass_m:
                if ass_m.groupdict().get('index'):
                    def assign(val):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                        lvar[idx] = val
                        return val
                    expr = ass_m.group('expr')
                else:
                    def assign(val):
                        local_vars[ass_m.group('out')] = val
                        return val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                assign = lambda v: v
                expr = stmt[len(u'return '):]
            else:
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)
            return assign(v)

        def interpret_expression(expr, local_vars, allow_recursion):
            if expr.isdigit():
                return int(expr)

            if expr.isalpha():
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            if m:
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                    return list(val)
                if member == 'join("")':
                    return u''.join(val)
                if member == 'length':
                    return len(val)
                if member == 'reverse()':
                    return val[::-1]
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                if slice_m:
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion - 1)
                    return val[idx:]

            m = re.match(
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            if m:
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,
                                           allow_recursion - 1)
                return val[idx]

            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            if m:
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)
                return a % b

            m = re.match(
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            if m:
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            func_m = re.search(
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
                jscode)
            argnames = func_m.group('args').split(',')

            def resf(args):
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)
                return res
            return resf

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])
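
    # A toy input for the mini-interpreter above (a sketch, not a real
    # player script):
    #
    #     signature=ab;function ab(a){a=a.split("");a=a.reverse();return a.join("")}
    #
    # _parse_sig_js locates ab, splits its body on ';', and interprets each
    # statement in turn, so the resulting callable maps u'abc' to u'cba'.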

    def _parse_sig_swf(self, file_contents):
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos + tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len
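
        # SWF RECORDHEADER layout handled above (per the SWF spec): the upper
        # 10 bits of the little-endian uint16 are the tag code, the lower 6
        # bits the length, and a length field of 0x3f signals a long-form
        # uint32 length. Worked example:
        #
        #     header16 = 0x14bf
        #     header16 >> 6    # -> 82 (DoABC, the tag searched for below)
        #     header16 & 0x3f  # -> 0x3f, so the real length follows as uint32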

        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v
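
        # These readers implement the ABC variable-length integers: 7 payload
        # bits per byte, little-endian, high bit set on all but the last
        # byte. Worked example for the byte sequence 0x96 0x01:
        #
        #     (0x96 & 0x7f) | ((0x01 & 0x7f) << 7)  ==  0x16 | 0x80  ==  150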

        def read_string(reader=None):
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count - 1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
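
    # A sketch of the bytecode shape the interpreter above typically sees
    # (illustrative, not dumped from a real player): a decipher method
    # compiles to roughly
    #
    #     getlocal_1              ; push the signature string
    #     pushstring ""           ; push the separator
    #     callproperty split, 1   ; -> list of characters
    #     ... swaps/slices via getproperty/setproperty ...
    #     callproperty join, 1    ; back to a string
    #     returnvalue
    #
    # i.e. the same split/permute/join shape as the JS players.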

    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
            try:
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    )
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                return func(s)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)

    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        if age_gate:
            # The videos with age protection use another player, so the
            # algorithms can be different.
            if len(s) == 86:
                return s[2:63] + s[82] + s[64:82] + s[63]

        if len(s) == 93:
            return s[86:29:-1] + s[88] + s[28:5:-1]
        elif len(s) == 92:
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
        elif len(s) == 91:
            return s[84:27:-1] + s[86] + s[26:5:-1]
        elif len(s) == 90:
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
        elif len(s) == 89:
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
        elif len(s) == 88:
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
        elif len(s) == 87:
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
        elif len(s) == 86:
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
        elif len(s) == 85:
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
        elif len(s) == 84:
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
        elif len(s) == 83:
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
        elif len(s) == 82:
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
        elif len(s) == 81:
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
        elif len(s) == 80:
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
        elif len(s) == 79:
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        else:
            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
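
    # Every branch above is just an index shuffle of the kind
    # _print_sig_code emits. For instance, in the age-gate 86-character
    # case the character at input position 82 lands at output position 61:
    #
    #     s = u''.join(map(compat_chr, range(86)))
    #     out = s[2:63] + s[82] + s[64:82] + s[63]
    #     assert len(out) == 81 and out[61] == s[82]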

    def _get_available_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
            return {}
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)

        sub_lang_list = {}
        for l in lang_list:
            lang = l[1]
            params = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(l[0]).encode('utf-8'),
            })
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
            return {}
        return sub_lang_list

    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _extract_from_m3u8(self, manifest_url, video_id):
        url_map = {}

        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
                          lines)
            return urls

        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map
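
    # Shape of the manifest consumed above, as a minimal sketch (the URL is
    # hypothetical): every non-comment line is a variant playlist URL and
    # the itag is embedded in its path:
    #
    #     #EXTM3U
    #     #EXT-X-STREAM-INF:BANDWIDTH=1280000
    #     http://example.com/path/itag/95/prog_index.m3u8
    #
    # which yields url_map == {'95': 'http://example.com/path/itag/95/prog_index.m3u8'}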

    def _extract_annotations(self, video_id):
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without logging in to Youtube
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'el': 'player_embedded',
                'gl': 'US',
                'hl': 'en',
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'asv': 3,
                'sts': '1588',
            })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            note=False,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            args = info['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # (these signatures are encrypted)
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                           (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

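        # The DASH manifest fetched below is an MPD document; a stripped
        # sketch of the element the loop consumes (attribute values are
        # hypothetical):
        #
        #     <Representation id="137" width="1920" bandwidth="4400000">
        #       <BaseURL yt:contentLength="123456">http://.../video.mp4</BaseURL>
        #     </Representation>
        #
        # Each Representation with a BaseURL becomes one format dict, keyed
        # by its id (the itag).
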
        # Look for the DASH manifest
        dash_manifest_url_lst = video_info.get('dashmpd')
        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
                self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                dash_doc = self._download_xml(
                    dash_manifest_url_lst[0], video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }


class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
                      get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                                 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        try:
            playlist_title = self._og_search_title(page)
        except RegexNotFoundError:
            self.report_warning(
                u'Playlist page is missing OpenGraph title, falling back ...',
                playlist_id)
            playlist_title = self._html_search_regex(
                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)


class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)


class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1601
1602
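# A hedged sketch of the ajax paging loop above: each JSON page carries the
# rendered video grid in 'content_html' plus a "load more" widget whose markup
# signals whether another page exists. fetch_json is a hypothetical callable
# (standing in for _download_webpage plus json.loads) that returns the decoded
# dict for a page number; the key names mirror the ones used above.
def _example_channel_paging(fetch_json, more_indicator='yt-uix-load-more'):
    video_ids = []
    for pagenum in itertools.count(1):
        page = fetch_json(pagenum)
        ids_in_page = re.findall(
            r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page['content_html'])
        # extract_videos_from_page de-duplicates within a page; orderedSet
        # gives the same order-preserving behaviour here
        video_ids.extend(orderedSet(ids_in_page))
        # the indicator disappearing from the widget markup ends the paging
        if more_indicator not in page['load_more_widget_html']:
            break
    return video_ids

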
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match.
        other_ies = iter(
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using the YouTube Data API. The result size per
        # query is limited (currently to 50 videos), so we query page by page
        # until no video ids are returned, which means we have got all of them.

        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE - 1))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)


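# A hedged sketch of the paging contract relied on above: GData results come
# in fixed-size pages addressed by a 1-based start-index, and PagedList (from
# ..utils) wraps a per-page generator into a lazily evaluated sequence.
# get_entries is a hypothetical callable returning the parsed 'entry' list
# (or None) for a given start index.
def _example_gdata_pages(get_entries, page_size=50):
    def download_page(pagenum):
        start_index = pagenum * page_size + 1  # GData indices are 1-based
        entries = get_entries(start_index)
        if not entries:
            return  # an empty page ends the iteration
        for entry in entries:
            # the tail of the GData entry id is the video id
            yield entry['id']['$t'].split('/')[-1]
    return PagedList(download_page, page_size)

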
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)

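# A hedged sketch of the pagination above: the jsonc API caps each page at 50
# items and reports 'totalItems', so the effective limit is the smaller of the
# requested n and what the API can actually return. fetch_items is a
# hypothetical callable (standing in for the download and json.loads steps)
# returning (items, total_items) for a 1-based start index.
def _example_collect_search_ids(fetch_items, n):
    video_ids = []
    pagenum = 0
    limit = n
    while 50 * pagenum < limit:
        items, total_items = fetch_items(50 * pagenum + 1)
        video_ids += [video['id'] for video in items]
        limit = min(n, total_items)  # shrink the target once the total is known
        pagenum += 1
    return video_ids[:n]
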
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'

class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in m_seasons]


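# A hedged sketch of the delegation pattern above: rather than extracting
# videos itself, the show extractor emits one url-type result per season
# playlist and leaves the real work to YoutubePlaylistIE. make_url_result is
# a hypothetical stand-in for InfoExtractor.url_result.
def _example_season_playlists(webpage, make_url_result):
    return [
        make_url_result('https://www.youtube.com' + m.group(1), 'YoutubePlaylist')
        for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)
    ]

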
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)

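# A hedged sketch of the feed walk above: each feed_ajax response carries
# rendered HTML plus a 'paging' token that is fed back into the next request,
# and a None token ends the loop. fetch_feed is a hypothetical callable
# mapping a paging token to the decoded JSON dict; the per-page
# de-duplication done with orderedSet above is omitted to keep this short.
def _example_walk_feed(fetch_feed):
    paging = 0
    while True:
        info = fetch_feed(paging)
        for m in re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html']):
            yield m.group(1)
        if info['paging'] is None:
            break
        paging = info['paging']
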
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'

class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # raw string, for consistency with the sibling extractors ('\.' stays literal)
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')


class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    def _real_extract(self, url):
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u'or simply youtube-dl BaW_jenozKc.',
            expected=True)