]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[youtube] Recognize a second format of the upload_date in the 'watch-uploader-info...
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the English/US version of the site.

        Returns True if the request succeeded, False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the credentials from options/.netrc.

        Returns True on success and False on failure or when no
        credentials are configured.  Raises ExtractorError when
        _LOGIN_REQUIRED is set but no login info is available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return False (was a bare `return`) so every failure path of
            # this method yields the same boolean result.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-verification confirmation form; returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age before any extraction."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
131
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    """Extractor for youtube.com watch pages (and equivalent mirrors)."""
    IE_DESC = u'YouTube.com'
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/ # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
                     $"""
    # Extracts the target URL from verify_age-style redirect pages.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known formats: maps a YouTube format id to static output metadata.
    # Negative 'preference' de-prioritizes special variants (3D, HLS, DASH).
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    # Regression tests consumed by the test harness.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
319
320
321 @classmethod
322 def suitable(cls, url):
323 """Receives a URL and returns True if suitable for this IE."""
324 if YoutubePlaylistIE.suitable(url): return False
325 return re.match(cls._VALID_URL, url) is not None
326
    def __init__(self, *args, **kwargs):
        # Forward construction to the base extractor, then set up the
        # per-player cache used by _decrypt_signature:
        # (player_url, signature length) -> deciphering function.
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}
330
    def report_video_info_webpage_download(self, video_id):
        """Report the attempt to download the info webpage of *video_id*."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
334
    def report_information_extraction(self, video_id):
        """Report the attempt to extract information for *video_id*."""
        self.to_screen(u'%s: Extracting video information' % video_id)
338
    def report_unavailable_format(self, video_id, format):
        """Report that *format* is not available for *video_id*."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
342
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
346
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build (and cache on disk) the signature-deciphering function.

        player_url points at the JS or SWF player; slen is the length of
        the encrypted signature (each length gets its own function).
        Returns a callable mapping the encrypted signature string to the
        deciphered one.
        """
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                        player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # func_id is used as a filename component; must not contain paths.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # Cached spec is a list of source indices; applying the
                # permutation is just picking those characters in order.
                return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # Record the permutation by running the function on the
                # identity string chr(0)..chr(slen-1), then persist the
                # resulting index list.  Cache write failures only warn.
                test_string = u''.join(map(compat_chr, range(slen)))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
403
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the deciphering function
        *func* for signatures of length *slen* (--youtube-print-sig-code).
        """
        def gen_sig_code(idxs):
            # Compress the index list into slice expressions where the
            # indices form contiguous ascending/descending runs.
            def _genslice(start, end, step):
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or emit the slice and reset.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new contiguous run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index.
                    yield u's[%d]' % prev
            # Flush whatever the last element belongs to.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Recover the permutation by applying func to chr(0)..chr(slen-1).
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
439
440 def _parse_sig_js(self, jscode):
441 funcname = self._search_regex(
442 r'signature=([a-zA-Z]+)', jscode,
443 u'Initial JS player signature function name')
444
445 jsi = JSInterpreter(jscode)
446 initial_function = jsi.extract_function(funcname)
447 return lambda s: initial_function([s])
448
    def _parse_sig_swf(self, file_contents):
        """Extract the deciphering function from an SWF player binary.

        Parses just enough of the SWF container and the embedded ABC
        (AVM2 bytecode) block to locate the SignatureDecipher class and
        interpret its 'decipher' method.  Returns a callable mapping an
        encrypted signature string to the deciphered one.
        """
        # SWF header starts with 'FWS' (uncompressed) or 'CWS'
        # (zlib-compressed); bytes 1-2 are 'WS' in both cases.
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # Skip the 8-byte header; the remainder is zlib-compressed.
            content = zlib.decompress(file_contents[8:])
        else:
            # Only compressed players have been observed; uncompressed
            # ('F') and LZMA ('Z') variants are not implemented.
            raise NotImplementedError(u'Unsupported compression format %r' %
                                      file_contents[:1])

        def extract_tags(content):
            # Yield (tag_code, tag body bytes) for each SWF tag.
            pos = 0
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                pos += 2
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                if tag_len == 0x3f:
                    # Long-form tag: real length in the next 4 bytes.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                    pos += 4
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])
                pos += tag_len

        # Tag code 82 is DoABC (contains the ActionScript bytecode).
        code_tag = next(tag
                        for tag_code, tag in extract_tags(content)
                        if tag_code == 82)
        # Skip the 4 flag bytes and the NUL-terminated name.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length integer: 7 data bits per byte, high bit set
            # means "continue"; at most 5 bytes.
            if reader is None:
                reader = code_reader
            res = 0
            shift = 0
            for _ in range(5):
                buf = reader.read(1)
                assert len(buf) == 1
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                if b & 0x80 == 0:
                    break
                shift += 7
            return res

        def u30(reader=None):
            # 30-bit unsigned value: top 2 bits of the u32 must be clear.
            res = read_int(reader)
            assert res & 0xf0000000 == 0
            return res
        u32 = read_int

        def s32(reader=None):
            # Signed 32-bit value in two's complement.
            v = read_int(reader)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
            return v

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            if reader is None:
                reader = code_reader
            slen = u30(reader)
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            if reader is None:
                reader = code_reader
            resb = reader.read(count)
            assert len(resb) == count
            return resb

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]
            return res

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool: all index-0 entries are implicit, hence the
        # range(1, count) loops.  Only strings are kept; other constant
        # kinds are read and discarded to advance the stream.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes((double_count-1) * 8)
        string_count = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            s = read_string()
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 fields that follow each multiname kind byte.
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        multinames = [u'']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            if kind == 0x07:
                # QName: only kind whose plain name we need to resolve.
                u30()  # namespace_idx
                name_idx = u30()
                multinames.append(constant_strings[name_idx])
            else:
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            # Consume one trait entry; returns {trait name: method index}
            # for method-like traits, empty dict otherwise.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                methods[function_idx] = multinames[trait_name_idx]
            else:
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes: first pass over instance info records the index of the
        # SignatureDecipher class while consuming every record.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count = u30()
        for class_id in range(class_count):
            name_idx = u30()
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %
                                 TARGET_CLASSNAME)

        # Second pass (class info records): collect the method name<->index
        # mappings of the target class only.
        method_names = {}
        method_idxs = {}
        for class_id in range(class_count):
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        (idx, name)
                        for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies: keep the bytecode of the target class's methods.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        methods = {}
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # The whole DoABC body must have been consumed exactly.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Lazily compile (memoized) an AVM2 method into a Python callable.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            def resfunc(args):
                # Minimal AVM2 interpreter: only the opcodes used by the
                # YouTube decipher routines are implemented.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                stack = []
                coder = io.BytesIO(m.code)
                while True:
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                        stack.append(v)
                    elif opcode == 44:  # pushstring
                        idx = u30(coder)
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                        stack.pop()
                    elif opcode == 70:  # callproperty
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            if args[0] == u'':
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                            res = obj[args[0]:]
                            stack.append(res)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                            stack.append(res)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                        else:
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                                % (mname, obj))
                    elif opcode == 72:  # returnvalue
                        res = stack.pop()
                        return res
                    elif opcode == 79:  # callpropvoid
                        index = u30(coder)
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        obj = stack.pop()
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            obj.reverse()
                        else:
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                                % (mname, obj))
                    elif opcode == 93:  # findpropstrict
                        index = u30(coder)
                        mname = multinames[index]
                        res = extract_function(mname)
                        stack.append(res)
                    elif opcode == 97:  # setproperty
                        index = u30(coder)
                        value = stack.pop()
                        idx = stack.pop()
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                        obj[idx] = value
                    elif opcode == 98:  # getlocal
                        index = u30(coder)
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        index = u30(coder)
                        value = stack.pop()
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        index = u30(coder)
                        pname = multinames[index]
                        if pname == u'length':
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            idx = stack.pop()
                            assert isinstance(idx, int)
                            obj = stack.pop()
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                        u30(coder)
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                        stack.append(res)
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                    else:
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc
            return resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
862
863 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
864 """Turn the encrypted s field into a working signature"""
865
866 if player_url is not None:
867 if player_url.startswith(u'//'):
868 player_url = u'https:' + player_url
869 try:
870 player_id = (player_url, len(s))
871 if player_id not in self._player_cache:
872 func = self._extract_signature_function(
873 video_id, player_url, len(s)
874 )
875 self._player_cache[player_id] = func
876 func = self._player_cache[player_id]
877 if self._downloader.params.get('youtube_print_sig_code'):
878 self._print_sig_code(func, len(s))
879 return func(s)
880 except Exception:
881 tb = traceback.format_exc()
882 self._downloader.report_warning(
883 u'Automatic signature extraction failed: ' + tb)
884
885 self._downloader.report_warning(
886 u'Warning: Falling back to static signature algorithm')
887
888 return self._static_decrypt_signature(
889 s, video_id, player_url, age_gate)
890
891 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
892 if age_gate:
893 # The videos with age protection use another player, so the
894 # algorithms can be different.
895 if len(s) == 86:
896 return s[2:63] + s[82] + s[64:82] + s[63]
897
898 if len(s) == 93:
899 return s[86:29:-1] + s[88] + s[28:5:-1]
900 elif len(s) == 92:
901 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
902 elif len(s) == 91:
903 return s[84:27:-1] + s[86] + s[26:5:-1]
904 elif len(s) == 90:
905 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
906 elif len(s) == 89:
907 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
908 elif len(s) == 88:
909 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
910 elif len(s) == 87:
911 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
912 elif len(s) == 86:
913 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
914 elif len(s) == 85:
915 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
916 elif len(s) == 84:
917 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
918 elif len(s) == 83:
919 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
920 elif len(s) == 82:
921 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
922 elif len(s) == 81:
923 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
924 elif len(s) == 80:
925 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
926 elif len(s) == 79:
927 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
928
929 else:
930 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
931
932 def _get_available_subtitles(self, video_id, webpage):
933 try:
934 sub_list = self._download_webpage(
935 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
936 video_id, note=False)
937 except ExtractorError as err:
938 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
939 return {}
940 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
941
942 sub_lang_list = {}
943 for l in lang_list:
944 lang = l[1]
945 params = compat_urllib_parse.urlencode({
946 'lang': lang,
947 'v': video_id,
948 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
949 'name': unescapeHTML(l[0]).encode('utf-8'),
950 })
951 url = u'https://www.youtube.com/api/timedtext?' + params
952 sub_lang_list[lang] = url
953 if not sub_lang_list:
954 self._downloader.report_warning(u'video doesn\'t have subtitles')
955 return {}
956 return sub_lang_list
957
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives in the embedded player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # Only tracks of kind 'asr' (automatic speech recognition) count
            # as automatic captions; anything else means there are none.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language.
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1004
1005 @classmethod
1006 def extract_id(cls, url):
1007 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1008 if mobj is None:
1009 raise ExtractorError(u'Invalid URL: %s' % url)
1010 video_id = mobj.group(2)
1011 return video_id
1012
1013 def _extract_from_m3u8(self, manifest_url, video_id):
1014 url_map = {}
1015 def _get_urls(_manifest):
1016 lines = _manifest.split('\n')
1017 urls = filter(lambda l: l and not l.startswith('#'),
1018 lines)
1019 return urls
1020 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1021 formats_urls = _get_urls(manifest)
1022 for format_url in formats_urls:
1023 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1024 url_map[itag] = format_url
1025 return url_map
1026
1027 def _extract_annotations(self, video_id):
1028 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1029 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1030
    def _real_extract(self, url):
        """Extract the formats and metadata of a single YouTube video.

        Returns the standard info dict (id, title, formats, ...).  Handles
        age-gated videos, encrypted signatures, RTMP/HLS fallbacks and,
        optionally, the DASH manifest.
        """
        # Honour --prefer-insecure for every request we build below.
        proto = (
            u'http' if self._downloader.params.get('prefer_insecure', False)
            else u'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            # NOTE(review): 'sts' looks like a fixed signature timestamp the
            # embedded player sends -- TODO confirm against the player config.
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'gl': 'US',
                                                  'hl': 'en',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                                  'asv': 3,
                                                  'sts':'1588',
                                                  })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            # Try several 'el' variants until one response carries a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        note=False,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    u'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    u'"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning(u'Unable to extract video title')
            video_title = u'_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: try the 'eow-date' span first, then the
        # 'watch-uploader-info' "Published/Uploaded on ..." variant.
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            # Normalise separators to spaces before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # category
        m_cat_container = get_element_by_id("eow-category", video_webpage)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect-wrapped links with their title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # Parse a comma-grouped counter ("1,234") out of a span of the given
        # CSS class; returns None when the span is absent.
        def _extract_count(klass):
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
        # Merge the stream maps from the page's ytplayer.config into
        # video_info; any failure simply leaves video_info untouched.
        try:
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present')  # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
            if m_s is not None:
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            m_s = re_signature.search(args.get('adaptive_fmts', u''))
            if m_s is not None:
                if 'adaptive_fmts' in video_info:
                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                else:
                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
        except ValueError:
            pass

        # Turn an itag->URL map into format dicts, enriching each entry with
        # the static metadata from self._formats when the itag is known.
        def _map_to_format_list(urlmap):
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        # Plain (already decrypted) signature.
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        # Encrypted signature: decrypt with the player code.
                        encrypted_sig = url_data['s'][0]
                        if self._downloader.params.get('verbose'):
                            if age_gate:
                                if player_url is None:
                                    player_version = 'unknown'
                                else:
                                    player_version = self._search_regex(
                                        r'-(.+)\.swf$', player_url,
                                        u'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-(.+?)\.js', video_webpage,
                                    'html5 player', fatal=False)
                                player_desc = u'html5 player %s' % player_version

                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                        if not age_gate:
                            jsplayer_url_json = self._search_regex(
                                r'"assets":.+?"js":\s*("[^"]+")',
                                video_webpage, u'JS player URL')
                            player_url = json.loads(jsplayer_url_json)

                        signature = self._decrypt_signature(
                            encrypted_sig, video_id, player_url, age_gate)
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
            try:
                # The DASH manifest used needs to be the one from the original video_webpage.
                # The one found in get_video_info seems to be using different signatures.
                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
                if age_gate:
                    dash_manifest_url = video_info.get('dashmpd')[0]
                else:
                    dash_manifest_url = ytplayer_config['args']['dashmpd']
                # Decrypt any /s/<sig> path component in the manifest URL.
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s
                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                dash_doc = self._download_xml(
                    dash_manifest_url, video_id,
                    note=u'Downloading DASH manifest',
                    errnote=u'Could not download DASH manifest')
                for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                    if url_el is None:
                        continue
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                    }
                    # Merge into an already-known format with the same id,
                    # otherwise append as a new format.
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        f.update(self._formats.get(format_id, {}))
                        formats.append(f)
                    else:
                        existing_format.update(f)

            except (ExtractorError, KeyError) as e:
                self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'formats': formats,
        }
1375
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a Youtube url_result dict."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix playlist ('RD' + seed video id)."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username="(.*?)".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        # Some of the videos may have been deleted, their username field is empty
        ids = [video_id for (username, video_id) in matches if username]
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract a playlist, following load-more pagination until done."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Follow the AJAX "load more" widget until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1488
1489
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Resolve a yttoplist pseudo-URL and extract the playlist behind it."""
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')

        # Locate the playlist link on the channel page by its title query.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(query), channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # The webpage sometimes comes back without the videos;
        # keep retrying until at least one id shows up.
        for attempt in itertools.count(0):
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1520
1521
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from page, in first-seen order."""
        ids_in_page = []
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = mobj.group(1)
            if candidate not in seen:
                seen.add(candidate)
                ids_in_page.append(candidate)
        return ids_in_page

    def _real_extract(self, url):
        """Extract every video of a channel as a playlist of url results."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        video_ids = []
        if autogenerated:
            # Auto-generated channels list everything on a single page;
            # the ajax pages can't be used, they come back empty.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Page through the json-based channel_ajax query.
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1576
1577
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Decline any URL another youtube extractor already claims: this
        # class's regex is too permissive and would shadow them otherwise.
        competing = (klass for (name, klass) in globals().items()
                     if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in competing):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """List a user's uploads via the GData API, paging lazily."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # The Data API caps each response (currently at 50 videos), so we
        # fetch page by page until the feed runs out of entries.
        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No more uploads: stop the generator.
                return

            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1638
1639
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        video_ids = []
        limit = n
        page_idx = 0

        # `limit` may shrink below n once the API reports its real
        # totalItems count.
        while PAGE_SIZE * page_idx < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * page_idx + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_idx + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_idx += 1

        # Trim any overshoot from the final page.
        del video_ids[n:]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1681
1682
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same GData search as YoutubeSearchIE, but ordered by upload date
    # via the API's orderby=published parameter.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1688
1689
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Scrape the result entries shown on a search results page."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # Each lockup title block holds one result's title and link.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1724
1725
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one playlist url result per season of the show."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches]
1739
1740
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = ('action_load_personal_feed' if self._PERSONAL_FEED
                  else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the feed's videos, following the load-more paging token."""
        feed_entries = []
        paging = 0
        for page_idx in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_idx)
            feed_html = info.get('feed_html') or info.get('content_html')
            ids = orderedSet(
                m.group(1) for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            next_mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_mobj is None:
                break
            paging = next_mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1785
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # System feed of the logged-in user's channel subscriptions; all
    # extraction logic lives in YoutubeFeedsInfoExtractor.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1791
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed of videos recommended to the logged-in user; all
    # extraction logic lives in YoutubeFeedsInfoExtractor.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1797
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Watch-later list is per-user, so it goes through the
    # action_load_personal_feed endpoint (_PERSONAL_FEED = True).
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1804
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Watch history is per-user, so it goes through the
    # action_load_personal_feed endpoint (_PERSONAL_FEED = True).
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string for the regex, consistent with the sibling feed extractors.
    # The previous u'...' literal only matched because Python happens to
    # preserve the unrecognized escape '\.' verbatim.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1811
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Delegate favourites to the playlist extractor backing the page."""
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds the id of its backing playlist.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1822
1823
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always fail: such a URL usually means an unquoted '&' was eaten
        by the shell, so tell the user how to quote properly."""
        message = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc"  '
            u'or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)