]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
[youtube] Update test description field
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import collections
4import errno
5import io
6import itertools
7import json
8import os.path
9import re
10import struct
11import traceback
12import zlib
13
14from .common import InfoExtractor, SearchInfoExtractor
15from .subtitles import SubtitlesInfoExtractor
16from ..jsinterp import JSInterpreter
17from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 uppercase_escape,
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force the site language to English so later regex-based scraping
        # sees predictable page text; success == "the page downloaded at all".
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in through the Google ServiceLogin form.

        Returns True on success and False when there are no credentials or
        the login is rejected.  NOTE(review): when the login page itself
        fails to download this returns None (bare ``return``) rather than
        False — callers only truthiness-test the result, so this is benign.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            return

        # The GALX hidden field is a CSRF-style token that must be echoed back.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the login form is still present in the response, login failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form; returns True (errors are fatal)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Run once before extraction: set language, then log in, then confirm
        # age.  Each step is skipped if the previous one failed.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static metadata for known itags; merged into the format dicts built
    # during extraction.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        # (the 'acodec': 'none' key used to appear twice in each of these
        # dict literals; the duplicates were removed — the value is unchanged)
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
318
319
320 @classmethod
321 def suitable(cls, url):
322 """Receives a URL and returns True if suitable for this IE."""
323 if YoutubePlaylistIE.suitable(url): return False
324 return re.match(cls._VALID_URL, url) is not None
325
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache of extracted signature-decryption functions,
        # keyed by (player_url, signature_length) in _decrypt_signature.
        self._player_cache = {}
329
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
333
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
337
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # NOTE: the parameter name `format` shadows the builtin; kept for
        # backward compatibility with existing keyword callers.
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
341
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
345
    def _extract_signature_function(self, video_id, player_url, slen):
        """Return a function decrypting signatures of length *slen*.

        Looks up a cached permutation on disk first; otherwise downloads the
        JS or SWF player referenced by *player_url*, extracts the algorithm,
        and (best-effort) writes the permutation back to the cache.
        """
        # Player URLs end in "...-<id>.<ext>"; ext selects the parser below.
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    # The cached spec is simply a list of source indices.
                    cache_spec = json.load(cachef)
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # The algorithm is assumed to be a pure character permutation:
                # running it over chr(0)..chr(slen-1) recovers an index list
                # that can be serialized as JSON.
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Cache writing is best-effort; never fail extraction over it.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
402
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the extracted signature function.

        Runs *func* over the identity string of length *slen*, then compresses
        the resulting index permutation into slice expressions suitable for
        pasting into _static_decrypt_signature.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a contiguous (step == +/-1) run as a slice literal.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: either extend it or flush it.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element/run (i still holds the last index).
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
438
    def _parse_sig_js(self, jscode):
        """Extract the signature function from the HTML5 player's JS code."""
        # The player assigns `signature=<funcname>(...)`; grab that name.
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes the signature as its single argument.
        return lambda s: initial_function([s])
447
    def _parse_sig_swf(self, file_contents):
        """Extract the signature function from a compiled (SWF) player.

        Decompresses the SWF container, locates the DoABC tag holding the
        AVM2 (ActionScript 3) bytecode, parses the ABC constant pool /
        method / class / method-body tables for the ``SignatureDecipher``
        class, and builds a tiny bytecode interpreter for its methods.
        Returns a function mapping an encrypted signature string to the
        decrypted one.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' signature: body after the 8-byte header is zlib-compressed.
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag_body) for each record of the SWF tag stream.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    # Long-form tag: real length follows in the next 4 bytes.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 is DoABC (embedded ActionScript bytecode).
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the 4 flag bytes and the NUL-terminated name that precede
        # the actual ABC data.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length little-endian integer, 7 bits per byte,
            # at most 5 bytes (like LEB128).
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # 30-bit unsigned: top 2 bits of the 32-bit value must be clear.
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit: reinterpret the unsigned value as two's complement.
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool.  All *_count fields count from 1 because entry 0 is
        # an implicit default; hence the range(1, ...) loops below.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 operands following each multiname kind byte.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # Only QNames resolve to a plain string name; that is all the
                # interpreter below needs for property/method dispatch.
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods (signatures only; bodies come later)
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata (skipped)
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Consume one trait record; return {method_name: method_idx} for
            # method-like traits (empty dict otherwise).
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes: find the instance record whose name is SignatureDecipher.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # class_info records parallel the instance records above; collect the
        # method traits of the target class (name->idx and idx->name maps).
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts (skipped)
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies: keep bytecode only for the target class's methods.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Sanity: the whole DoABC payload must have been consumed, and every
        # discovered method must have a body.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Lazily (and memoized) wrap the named AVM2 method in a Python
            # callable that interprets just the opcode subset YouTube uses.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Register 0 is `this`; arguments follow; the rest start None.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        # The entry point is always a method called `decipher`.
        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
861
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            if player_url.startswith(u'//'):
                # Protocol-relative player URL: default to https.
                player_url = u'https:' + player_url
            try:
                # Algorithms vary per player *and* per signature length,
                # so cache the extracted function under both.
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    )
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                return func(s)
            except Exception:
                # Extraction is best-effort; fall through to the static
                # algorithms below after warning the user.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(
                u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
889
890 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
891 if age_gate:
892 # The videos with age protection use another player, so the
893 # algorithms can be different.
894 if len(s) == 86:
895 return s[2:63] + s[82] + s[64:82] + s[63]
896
897 if len(s) == 93:
898 return s[86:29:-1] + s[88] + s[28:5:-1]
899 elif len(s) == 92:
900 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
901 elif len(s) == 91:
902 return s[84:27:-1] + s[86] + s[26:5:-1]
903 elif len(s) == 90:
904 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
905 elif len(s) == 89:
906 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
907 elif len(s) == 88:
908 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
909 elif len(s) == 87:
910 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
911 elif len(s) == 86:
912 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
913 elif len(s) == 85:
914 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
915 elif len(s) == 84:
916 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
917 elif len(s) == 83:
918 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
919 elif len(s) == 82:
920 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
921 elif len(s) == 81:
922 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
923 elif len(s) == 80:
924 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
925 elif len(s) == 79:
926 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
927
928 else:
929 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
930
931 def _get_available_subtitles(self, video_id, webpage):
932 try:
933 sub_list = self._download_webpage(
934 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
935 video_id, note=False)
936 except ExtractorError as err:
937 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
938 return {}
939 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
940
941 sub_lang_list = {}
942 for l in lang_list:
943 lang = l[1]
944 params = compat_urllib_parse.urlencode({
945 'lang': lang,
946 'v': video_id,
947 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
948 'name': unescapeHTML(l[0]).encode('utf-8'),
949 })
950 url = u'https://www.youtube.com/api/timedtext?' + params
951 sub_lang_list[lang] = url
952 if not sub_lang_list:
953 self._downloader.report_warning(u'video doesn\'t have subtitles')
954 return {}
955 return sub_lang_list
956
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language codes to ASR caption URLs,
        or an empty dict (after a warning) when none are available.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # NOTE(review): sibling code at _real_extract uses the stricter
        # r';ytplayer\.config\s*=\s*({.*?});' — confirm whether this looser
        # variant (unescaped dot, fixed spacing) is intentional.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> must be the ASR (auto-generated) one;
            # otherwise there are no automatic captions for this video.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # One translated caption URL per <target> language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1003
1004 @classmethod
1005 def extract_id(cls, url):
1006 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1007 if mobj is None:
1008 raise ExtractorError(u'Invalid URL: %s' % url)
1009 video_id = mobj.group(2)
1010 return video_id
1011
1012 def _extract_from_m3u8(self, manifest_url, video_id):
1013 url_map = {}
1014 def _get_urls(_manifest):
1015 lines = _manifest.split('\n')
1016 urls = filter(lambda l: l and not l.startswith('#'),
1017 lines)
1018 return urls
1019 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1020 formats_urls = _get_urls(manifest)
1021 for format_url in formats_urls:
1022 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1023 url_map[itag] = format_url
1024 return url_map
1025
1026 def _extract_annotations(self, video_id):
1027 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1028 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1029
    def _real_extract(self, url):
        """Extract metadata and the format list for a single watch URL.

        Downloads the watch page and get_video_info, decrypts signatures when
        needed, optionally merges DASH manifest formats, and returns the
        standard info dict.
        """
        proto = (
            u'http' if self._downloader.params.get('prefer_insecure', False)
            else u'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL (backslash-escaped in the page JS)
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    note=False,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants until get_video_info returns a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                        note=False,
                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    u'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    u'"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect links with their visible title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        def _extract_count(klass):
            # Scrape an integer count (likes/dislikes) from the page markup.
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        def _map_to_format_list(urlmap):
            # Turn an itag->URL mapping into format dicts, merging known
            # per-itag metadata from self._formats.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (unencrypted) signature — use it directly.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                def decrypt_sig(mobj):
                    # re.sub callback: replace '/s/<sig>' with its decrypted form.
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    try:
                        # Merge into an already-known format with the same id,
                        # otherwise append as a new one.
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id':           video_id,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  upload_date,
            'title':        video_title,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'subtitles':    video_subtitles,
            'duration':     video_duration,
            'age_limit':    18 if age_gate else 0,
            'annotations':  video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count':   view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats':      formats,
        }
1360
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, mixes (RD...) and related list URLs."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        | p/
                     )
                     (
                         (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                         # Top tracks, they can also include dots
                         |(?:MC)[\w\.]*
                     )
                     .*
                     |
                     ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                  )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap a list of video ids into Youtube url_result dicts."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist (id 'RD' + seed video id)."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username="(.*?)".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        # Some of the videos may have been deleted, their username field is empty
        ids = [video_id for (username, video_id) in matches if username]
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract all video entries of a playlist, following pagination."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Follow the "load more" ajax widget until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1466
1467
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo-URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # Locate the playlist link on the channel page via its title query.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(query), channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        ids = []
        while not ids:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1498
1499
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists every video uploaded to a channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect unique video ids from watch links, in page order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # Autogenerated channels list everything on a single page; the
            # ajax pagination endpoint returns empty results for them.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1554
1555
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a user, via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # The API caps result size per query; pages are fetched lazily below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator of url_result dicts for one API page (1-based index).
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' marks the end of the uploads list.
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1616
1617
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor backing the "ytsearch" keyword (GData search API)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        video_ids = []
        pagenum = 0
        limit = n

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The feed reports the true total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
1659
1660
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Like YoutubeSearchIE, but orders results by publication date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Same endpoint as the parent, plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1666
1667
class YoutubeSearchURLIE(InfoExtractor):
    """Extracts the result list from a YouTube search-results page URL."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # One <h3 class="yt-lockup-title"> snippet per search result.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            part_url = compat_urlparse.urljoin(
                'https://www.youtube.com/', part_url_snippet)
            entries.append({
                '_type': 'url',
                'url': part_url,
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1702
1703
class YoutubeShowIE(InfoExtractor):
    """Expands a show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in seasons]
1717
1718
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        for page_idx in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_idx)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Deduplicated watch ids, preserving order of appearance.
            ids = orderedSet(
                m.group(1) for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # A null paging token marks the last page.
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1760
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1766
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1772
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch-later list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch later is a per-account feed, so it needs the personal-feed action.
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1779
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, matching the sibling extractors: the previous plain unicode
    # literal only worked because Python passes the unrecognized '\.' escape
    # through verbatim, which is a DeprecationWarning on modern interpreters.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is a per-account feed, so it needs the personal-feed action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1786
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist; delegate
        # the actual extraction to the playlist extractor.
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1797
1798
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catches watch URLs whose video id was swallowed by an unquoted shell '&'."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        # There is nothing to download -- the URL arrived without a video id,
        # so explain the likely quoting mistake to the user instead.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .',
            expected=True)