]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[yahoo] Add support for movies (Fixes #2780)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Force the YouTube UI to English (US) via the persist-language URL.

        Returns True when the request succeeded, False otherwise (the
        download is non-fatal).
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube through the Google accounts service.

        Returns True on success and False on any failure.  Raises
        ExtractorError when no credentials are configured but
        _LOGIN_REQUIRED is set on the class.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Fix: previously a bare `return` (None); every other failure
            # path reports False, so be consistent for callers.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the login form is still present, authentication was rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form; always returns True.

        A failed download raises (the request is fatal by default).
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
                                            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Language must be set before login; age confirmation only makes
        # sense once both preceding steps succeeded.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
132 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (?x) pattern accepting watch/embed/v URLs on youtube.com and
    # known mirror/proxy hosts, youtu.be short links, protocol-relative URLs,
    # and bare 11-character video IDs.  Group 2 captures the video ID.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the URL-encoded next_url query parameter from verify-age pages
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
161 _formats = {
162 '5': {'ext': 'flv', 'width': 400, 'height': 240},
163 '6': {'ext': 'flv', 'width': 450, 'height': 270},
164 '13': {'ext': '3gp'},
165 '17': {'ext': '3gp', 'width': 176, 'height': 144},
166 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
167 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
168 '34': {'ext': 'flv', 'width': 640, 'height': 360},
169 '35': {'ext': 'flv', 'width': 854, 'height': 480},
170 '36': {'ext': '3gp', 'width': 320, 'height': 240},
171 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
172 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
173 '43': {'ext': 'webm', 'width': 640, 'height': 360},
174 '44': {'ext': 'webm', 'width': 854, 'height': 480},
175 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
176 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
177
178
179 # 3d videos
180 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
181 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
182 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
183 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
184 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
185 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
186 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
187
188 # Apple HTTP Live Streaming
189 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
190 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
191 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
192 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
193 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
194 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
195 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
196
197 # DASH mp4 video
198 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
199 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
200 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
201 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
202 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
203 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
204 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
205 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
206
207 # Dash mp4 audio
208 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
209 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
210 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
211
212 # Dash webm
213 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
214 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
215 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
216 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
217 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
218 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
219 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40},
220 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40},
221 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
222 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
223 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
224 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40},
225 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40},
226
227 # Dash webm audio
228 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
229 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
230
231 # RTMP (unnamed)
232 '_rtmp': {'protocol': 'rtmp'},
233 }
234
    IE_NAME = u'youtube'
    # Test fixtures consumed by the shared test runner: each entry pairs a
    # URL with the expected metadata ('info_dict') and optional downloader
    # options ('params').
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:5b292926389560516e384ac437c0ec07",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
318
319
320 @classmethod
321 def suitable(cls, url):
322 """Receives a URL and returns True if suitable for this IE."""
323 if YoutubePlaylistIE.suitable(url): return False
324 return re.match(cls._VALID_URL, url) is not None
325
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache of extracted signature functions, keyed by
        # (player_url, signature length) — see _decrypt_signature.
        self._player_cache = {}
329
330 def report_video_info_webpage_download(self, video_id):
331 """Report attempt to download video info webpage."""
332 self.to_screen(u'%s: Downloading video info webpage' % video_id)
333
334 def report_information_extraction(self, video_id):
335 """Report attempt to extract video information."""
336 self.to_screen(u'%s: Extracting video information' % video_id)
337
338 def report_unavailable_format(self, video_id, format):
339 """Report extracted video URL."""
340 self.to_screen(u'%s: Format %s not available' % (video_id, format))
341
342 def report_rtmp_download(self):
343 """Indicate the download will use the RTMP protocol."""
344 self.to_screen(u'RTMP download detected')
345
    def _extract_signature_function(self, video_id, player_url, slen):
        """Obtain the signature-decrypting function for a given player.

        Looks up the on-disk cache first; otherwise downloads the JS or SWF
        player referenced by *player_url*, extracts its decipher routine, and
        (best-effort) writes the resulting character permutation back to the
        cache.  *slen* is the length of the scrambled signature.
        """
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # Guard against path traversal via a crafted player URL
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # Cached spec is a list of source indices: apply as a
                # character permutation of s
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # Recover the permutation by running the function on a test
                # string of distinct characters, then persist it as JSON
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Caching is best-effort; never let it break extraction
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
402
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to an extracted signature function.

        Runs *func* on a test string to recover the character permutation,
        then renders it as a compact sum of slice expressions suitable for
        pasting into _static_decrypt_signature.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a contiguous run of indices as one slice expression
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Extend or flush the current run of +/-1 steps
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush whatever is pending after the last index
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
438
439 def _parse_sig_js(self, jscode):
440 funcname = self._search_regex(
441 r'signature=([a-zA-Z]+)', jscode,
442 u'Initial JS player signature function name')
443
444 jsi = JSInterpreter(jscode)
445 initial_function = jsi.extract_function(funcname)
446 return lambda s: initial_function([s])
447
    def _parse_sig_swf(self, file_contents):
        """Extract the signature decipher routine from an SWF player.

        Decompresses the SWF container, locates the DoABC tag (code 82),
        parses the ABC (AVM2 bytecode) constant pool, method, class, script
        and method-body tables, and finally returns a callable that runs the
        ``decipher`` method of the ``SignatureDecipher`` class through a
        minimal AVM2 opcode interpreter.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' header: payload after the 8-byte header is zlib-compressed
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        # Yield (tag_code, tag_body) for each tag in the SWF body
        def extract_tags(content):
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    # Long-form tag: real length follows in 4 extra bytes
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the DoABC flags (4 bytes) and NUL-terminated name
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length unsigned integer, 7 bits per byte, max 5 bytes
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # u30: like u32, but the top two bits must be clear
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit: reinterpret the unsigned value as two's complement
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool (all pools are 1-based; index 0 is implicit)
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of trailing u30 fields per multiname kind (for skipping)
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only QNames are resolved to real names; other kinds are
                # skipped but keep their slot so indices stay aligned
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Consume one trait entry; return {trait name: method index}
            # for method-like traits (empty dict otherwise)
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes: instance_info records — find the target class by name
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # class_info records: collect the target class's method traits
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies: keep only the bytecode of the methods we care about
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # The whole DoABC tag must have been consumed exactly
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Lazily compile one ABC method into a Python callable,
            # memoizing so mutually-referencing methods resolve
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Minimal AVM2 interpreter: only the opcodes the YouTube
                # decipher routines actually use are implemented
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
861
862 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
863 """Turn the encrypted s field into a working signature"""
864
865 if player_url is not None:
866 if player_url.startswith(u'//'):
867 player_url = u'https:' + player_url
868 try:
869 player_id = (player_url, len(s))
870 if player_id not in self._player_cache:
871 func = self._extract_signature_function(
872 video_id, player_url, len(s)
873 )
874 self._player_cache[player_id] = func
875 func = self._player_cache[player_id]
876 if self._downloader.params.get('youtube_print_sig_code'):
877 self._print_sig_code(func, len(s))
878 return func(s)
879 except Exception:
880 tb = traceback.format_exc()
881 self._downloader.report_warning(
882 u'Automatic signature extraction failed: ' + tb)
883
884 self._downloader.report_warning(
885 u'Warning: Falling back to static signature algorithm')
886
887 return self._static_decrypt_signature(
888 s, video_id, player_url, age_gate)
889
890 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
891 if age_gate:
892 # The videos with age protection use another player, so the
893 # algorithms can be different.
894 if len(s) == 86:
895 return s[2:63] + s[82] + s[64:82] + s[63]
896
897 if len(s) == 93:
898 return s[86:29:-1] + s[88] + s[28:5:-1]
899 elif len(s) == 92:
900 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
901 elif len(s) == 91:
902 return s[84:27:-1] + s[86] + s[26:5:-1]
903 elif len(s) == 90:
904 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
905 elif len(s) == 89:
906 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
907 elif len(s) == 88:
908 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
909 elif len(s) == 87:
910 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
911 elif len(s) == 86:
912 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
913 elif len(s) == 85:
914 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
915 elif len(s) == 84:
916 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
917 elif len(s) == 83:
918 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
919 elif len(s) == 82:
920 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
921 elif len(s) == 81:
922 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
923 elif len(s) == 80:
924 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
925 elif len(s) == 79:
926 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
927
928 else:
929 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
930
931 def _get_available_subtitles(self, video_id, webpage):
932 try:
933 sub_list = self._download_webpage(
934 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
935 video_id, note=False)
936 except ExtractorError as err:
937 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
938 return {}
939 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
940
941 sub_lang_list = {}
942 for l in lang_list:
943 lang = l[1]
944 params = compat_urllib_parse.urlencode({
945 'lang': lang,
946 'v': video_id,
947 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
948 'name': unescapeHTML(l[0]).encode('utf-8'),
949 })
950 url = u'https://www.youtube.com/api/timedtext?' + params
951 sub_lang_list[lang] = url
952 if not sub_lang_list:
953 self._downloader.report_warning(u'video doesn\'t have subtitles')
954 return {}
955 return sub_lang_list
956
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        # Returns a dict mapping target language code -> caption URL, or {}
        # when no automatic (ASR) captions are available.
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Only 'asr' (automatic speech recognition) tracks count here;
            # regular uploaded subtitles are handled by another code path.
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1003
1004 @classmethod
1005 def extract_id(cls, url):
1006 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1007 if mobj is None:
1008 raise ExtractorError(u'Invalid URL: %s' % url)
1009 video_id = mobj.group(2)
1010 return video_id
1011
1012 def _extract_from_m3u8(self, manifest_url, video_id):
1013 url_map = {}
1014 def _get_urls(_manifest):
1015 lines = _manifest.split('\n')
1016 urls = filter(lambda l: l and not l.startswith('#'),
1017 lines)
1018 return urls
1019 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1020 formats_urls = _get_urls(manifest)
1021 for format_url in formats_urls:
1022 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1023 url_map[itag] = format_url
1024 return url_map
1025
1026 def _extract_annotations(self, video_id):
1027 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1028 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1029
    def _real_extract(self, url):
        """Extract metadata and all downloadable formats for a single video.

        Returns the standard info dict (id, title, uploader, formats, ...).
        Raises ExtractorError when the video info cannot be obtained or no
        usable stream map is found.
        """
        proto = (
            u'http' if self._downloader.params.get('prefer_insecure', False)
            else u'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' values; the first response containing a
            # 'token' field wins.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            note=False,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect links with their title (the visible text).
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Parse a comma-grouped integer out of the span with class *klass*.
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # these signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            # Turn an itag -> url mapping into the formats list, merging in
            # static format metadata from self._formats where known.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (unencrypted) signature: append directly.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature: must be descrambled first.
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        # If the itag was already seen in the stream maps,
                        # enrich that entry instead of duplicating it.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1356
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each bare video id as a url_result for the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username="(.*?)".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        # Some of the videos may have been deleted, their username field is empty
        ids = [video_id for (username, video_id) in matches if username]
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Resolve a playlist URL into a playlist of video url_results."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Extract the video ids from the playlist pages, following the
        # "load more" AJAX widget until it disappears.
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1462
1463
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for 'yttoplist:{channel}:{list title}' pseudo-URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Locate the playlist link on the channel page by its title query.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # The page sometimes comes back without the video list; keep
        # retrying until the ids show up.
        ids = []
        for attempt in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1494
1495
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the watch-link video ids found in *page*, first-seen order,
        without duplicates."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect all video ids of a channel and return them as a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the "load more" widget no longer appears.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1550
1551
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors, the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily-paged playlist of a user's uploads (GData API)."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator yielding url-entries for one GData page;
            # GData start indexes are 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # An empty page: signals PagedList that there is nothing more.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1612
1613
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more pages than the API reports to exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot; trim to exactly n results.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1655
1656
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant that orders results newest-first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same API endpoint as the parent, plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1662
1663
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for result pages of the YouTube search form."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        # Each result item carries its title and watch link inside an <h3>.
        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1698
1699
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season show pages: yields one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in seasons
        ]
1713
1714
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared logic for extractors backed by www.youtube.com/feed_ajax.
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        page_num = 0
        # Follow the feed's paging token until the server reports no more.
        while True:
            page_num += 1
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            found = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            for video_id in orderedSet(m.group(1) for m in found):
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1756
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed of the logged-in account's subscription uploads.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1762
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed of videos YouTube recommends to the logged-in account.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1768
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Watch-later is per-account, so it must use the personal feed action.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1775
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fixed: this was a non-raw u'' string, so the '\.' sequences only
    # survived via unrecognized-escape passthrough; use a raw string like
    # every sibling extractor's _VALID_URL.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is per-account, so it must use the personal feed action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1782
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Resolve the favourites page to its backing playlist."""
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds its playlist id in a 'list=' parameter.
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1793
1794
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose video id was swallowed by an unquoted shell '&'."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        # Nothing to download: the URL has already lost its video id, so
        # tell the user how to quote the URL properly.
        message = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)