]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
release 2014.08.21
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import errno
4import io
5import itertools
6import json
7import os.path
8import re
9import traceback
10
11from .common import InfoExtractor, SearchInfoExtractor
12from .subtitles import SubtitlesInfoExtractor
13from ..jsinterp import JSInterpreter
14from ..swfinterp import SWFInterpreter
15from ..utils import (
16 compat_chr,
17 compat_parse_qs,
18 compat_urllib_parse,
19 compat_urllib_request,
20 compat_urlparse,
21 compat_str,
22
23 clean_html,
24 get_cachedir,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 PagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 write_json_file,
34 uppercase_escape,
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Switch the YouTube UI to English so later regex scraping is stable.

        Returns True if the language page was downloaded successfully.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in to YouTube with credentials from the downloader/.netrc.

        Returns True on success and False on any failure (no credentials
        while login is optional, page download failure, or rejected
        credentials). Raises ExtractorError when _LOGIN_REQUIRED is set but
        no login info was provided.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Fix: this used to be a bare `return` (None); return False so the
            # method consistently yields a boolean on every failure path.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the response still contains the login form, the credentials
        # were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """Submit the age-confirmation form. Always returns True (the
        download itself raises on a fatal error)."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        # Set language, log in (if credentials given) and confirm age;
        # each step is skipped/aborted when the previous one fails.
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
128
129
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Group 1: everything before the ID (optional, so a naked 11-char ID
    # also matches); group 2: the video ID itself.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Extracts the redirect target from age-verification style URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format metadata, merged into each extracted format dict.
    # 'preference' ranks format families (3D/HLS/DASH sorted below plain).
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},

        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
234
    IE_NAME = u'youtube'
    # Download-test definitions: each entry pins the expected metadata for a
    # specific video; 'md5:'-prefixed description values are checksums of the
    # full text.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
319
320
321 @classmethod
322 def suitable(cls, url):
323 """Receives a URL and returns True if suitable for this IE."""
324 if YoutubePlaylistIE.suitable(url): return False
325 return re.match(cls._VALID_URL, url) is not None
326
327 def __init__(self, *args, **kwargs):
328 super(YoutubeIE, self).__init__(*args, **kwargs)
329 self._player_cache = {}
330
331 def report_video_info_webpage_download(self, video_id):
332 """Report attempt to download video info webpage."""
333 self.to_screen(u'%s: Downloading video info webpage' % video_id)
334
335 def report_information_extraction(self, video_id):
336 """Report attempt to extract video information."""
337 self.to_screen(u'%s: Extracting video information' % video_id)
338
339 def report_unavailable_format(self, video_id, format):
340 """Report extracted video URL."""
341 self.to_screen(u'%s: Format %s not available' % (video_id, format))
342
343 def report_rtmp_download(self):
344 """Indicate the download will use the RTMP protocol."""
345 self.to_screen(u'RTMP download detected')
346
347 def _signature_cache_id(self, example_sig):
348 """ Return a string representation of a signature """
349 return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
350
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Download the player (JS or SWF) and build a signature-decryption function.

        The learned character permutation is cached on disk, keyed by player
        type/id and the signature's length pattern, so subsequent videos using
        the same player skip the download and parsing entirely.
        """
        id_m = re.match(
            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes a filename component; it must not contain separators
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
        if cache_enabled:
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                    u'youtube-sigfuncs',
                                    func_id + '.json')
            try:
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    # The cached spec is a list of source indices: the
                    # decrypted signature is s[i] for each i, in order.
                    cache_spec = json.load(cachef)
                    return lambda s: u''.join(s[i] for i in cache_spec)
            except IOError:
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_enabled:
            try:
                # Feed the function a probe string of distinct characters to
                # learn which input positions end up in the output, then
                # persist those indices as the cache spec.
                test_string = u''.join(map(compat_chr, range(len(example_sig))))
                cache_res = res(test_string)
                cache_spec = [ord(c) for c in cache_res]
                try:
                    os.makedirs(os.path.dirname(cache_fn))
                except OSError as ose:
                    # Only an already-existing directory is acceptable
                    if ose.errno != errno.EEXIST:
                        raise
                write_json_file(cache_spec, cache_fn)
            except Exception:
                # Caching is best-effort; never fail extraction because of it
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))

        return res
411
    def _print_sig_code(self, func, example_sig):
        """Print equivalent Python code for the extracted signature function.

        Runs func on a probe string to recover the permutation it applies,
        then emits it as a sum of slice/index expressions, compressing runs of
        consecutive (step +/-1) indices into slices.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting parts that match
                # Python's slice defaults
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set as soon as step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run: extend it, or emit the finished
                    # slice and fall through to single-index handling
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new sliceable run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the trailing element or the still-open run
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = u''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                u'    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
450
451 def _parse_sig_js(self, jscode):
452 funcname = self._search_regex(
453 r'signature=([$a-zA-Z]+)', jscode,
454 u'Initial JS player signature function name')
455
456 jsi = JSInterpreter(jscode)
457 initial_function = jsi.extract_function(funcname)
458 return lambda s: initial_function([s])
459
460 def _parse_sig_swf(self, file_contents):
461 swfi = SWFInterpreter(file_contents)
462 TARGET_CLASSNAME = u'SignatureDecipher'
463 searched_class = swfi.extract_class(TARGET_CLASSNAME)
464 initial_function = swfi.extract_function(searched_class, u'decipher')
465 return lambda s: initial_function([s])
466
467 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
468 """Turn the encrypted s field into a working signature"""
469
470 if player_url is None:
471 raise ExtractorError(u'Cannot decrypt signature without player_url')
472
473 if player_url.startswith(u'//'):
474 player_url = u'https:' + player_url
475 try:
476 player_id = (player_url, self._signature_cache_id(s))
477 if player_id not in self._player_cache:
478 func = self._extract_signature_function(
479 video_id, player_url, s
480 )
481 self._player_cache[player_id] = func
482 func = self._player_cache[player_id]
483 if self._downloader.params.get('youtube_print_sig_code'):
484 self._print_sig_code(func, s)
485 return func(s)
486 except Exception as e:
487 tb = traceback.format_exc()
488 raise ExtractorError(
489 u'Signature extraction failed: ' + tb, cause=e)
490
491 def _get_available_subtitles(self, video_id, webpage):
492 try:
493 sub_list = self._download_webpage(
494 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
495 video_id, note=False)
496 except ExtractorError as err:
497 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
498 return {}
499 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
500
501 sub_lang_list = {}
502 for l in lang_list:
503 lang = l[1]
504 params = compat_urllib_parse.urlencode({
505 'lang': lang,
506 'v': video_id,
507 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
508 'name': unescapeHTML(l[0]).encode('utf-8'),
509 })
510 url = u'https://www.youtube.com/api/timedtext?' + params
511 sub_lang_list[lang] = url
512 if not sub_lang_list:
513 self._downloader.report_warning(u'video doesn\'t have subtitles')
514 return {}
515 return sub_lang_list
516
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language code -> caption URL, or an
        empty dict (with a warning) when automatic captions are unavailable.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # Automatic captions exist only when the original track is ASR
            # (automatic speech recognition)
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            # One translated-caption URL per available target language
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
563
564 @classmethod
565 def extract_id(cls, url):
566 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
567 if mobj is None:
568 raise ExtractorError(u'Invalid URL: %s' % url)
569 video_id = mobj.group(2)
570 return video_id
571
572 def _extract_from_m3u8(self, manifest_url, video_id):
573 url_map = {}
574 def _get_urls(_manifest):
575 lines = _manifest.split('\n')
576 urls = filter(lambda l: l and not l.startswith('#'),
577 lines)
578 return urls
579 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
580 formats_urls = _get_urls(manifest)
581 for format_url in formats_urls:
582 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
583 url_map[itag] = format_url
584 return url_map
585
586 def _extract_annotations(self, video_id):
587 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
588 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
589
590 def _real_extract(self, url):
591 proto = (
592 u'http' if self._downloader.params.get('prefer_insecure', False)
593 else u'https')
594
595 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
596 mobj = re.search(self._NEXT_URL_RE, url)
597 if mobj:
598 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
599 video_id = self.extract_id(url)
600
601 # Get video webpage
602 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
603 video_webpage = self._download_webpage(url, video_id)
604
605 # Attempt to extract SWF player URL
606 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
607 if mobj is not None:
608 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
609 else:
610 player_url = None
611
612 # Get video info
613 self.report_video_info_webpage_download(video_id)
614 if re.search(r'player-age-gate-content">', video_webpage) is not None:
615 self.report_age_confirmation()
616 age_gate = True
617 # We simulate the access to the video from www.youtube.com/v/{video_id}
618 # this can be viewed without login into Youtube
619 data = compat_urllib_parse.urlencode({
620 'video_id': video_id,
621 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
622 'sts': self._search_regex(
623 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
624 })
625 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
626 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 note=False,
628 errnote='unable to download video info webpage')
629 video_info = compat_parse_qs(video_info_webpage)
630 else:
631 age_gate = False
632 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
633 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
634 % (video_id, el_type))
635 video_info_webpage = self._download_webpage(video_info_url, video_id,
636 note=False,
637 errnote='unable to download video info webpage')
638 video_info = compat_parse_qs(video_info_webpage)
639 if 'token' in video_info:
640 break
641 if 'token' not in video_info:
642 if 'reason' in video_info:
643 raise ExtractorError(
644 u'YouTube said: %s' % video_info['reason'][0],
645 expected=True, video_id=video_id)
646 else:
647 raise ExtractorError(
648 u'"token" parameter not in video info for unknown reason',
649 video_id=video_id)
650
651 if 'view_count' in video_info:
652 view_count = int(video_info['view_count'][0])
653 else:
654 view_count = None
655
656 # Check for "rental" videos
657 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
658 raise ExtractorError(u'"rental" videos not supported')
659
660 # Start extracting information
661 self.report_information_extraction(video_id)
662
663 # uploader
664 if 'author' not in video_info:
665 raise ExtractorError(u'Unable to extract uploader name')
666 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
667
668 # uploader_id
669 video_uploader_id = None
670 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
671 if mobj is not None:
672 video_uploader_id = mobj.group(1)
673 else:
674 self._downloader.report_warning(u'unable to extract uploader nickname')
675
676 # title
677 if 'title' in video_info:
678 video_title = video_info['title'][0]
679 else:
680 self._downloader.report_warning(u'Unable to extract video title')
681 video_title = u'_'
682
683 # thumbnail image
684 # We try first to get a high quality image:
685 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
686 video_webpage, re.DOTALL)
687 if m_thumb is not None:
688 video_thumbnail = m_thumb.group(1)
689 elif 'thumbnail_url' not in video_info:
690 self._downloader.report_warning(u'unable to extract video thumbnail')
691 video_thumbnail = None
692 else: # don't panic if we can't find it
693 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
694
695 # upload date
696 upload_date = None
697 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
698 if mobj is None:
699 mobj = re.search(
700 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
701 video_webpage)
702 if mobj is not None:
703 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
704 upload_date = unified_strdate(upload_date)
705
706 m_cat_container = get_element_by_id("eow-category", video_webpage)
707 if m_cat_container:
708 category = self._html_search_regex(
709 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
710 default=None)
711 video_categories = None if category is None else [category]
712 else:
713 video_categories = None
714
715 # description
716 video_description = get_element_by_id("eow-description", video_webpage)
717 if video_description:
718 video_description = re.sub(r'''(?x)
719 <a\s+
720 (?:[a-zA-Z-]+="[^"]+"\s+)*?
721 title="([^"]+)"\s+
722 (?:[a-zA-Z-]+="[^"]+"\s+)*?
723 class="yt-uix-redirect-link"\s*>
724 [^<]+
725 </a>
726 ''', r'\1', video_description)
727 video_description = clean_html(video_description)
728 else:
729 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
730 if fd_mobj:
731 video_description = unescapeHTML(fd_mobj.group(1))
732 else:
733 video_description = u''
734
735 def _extract_count(klass):
736 count = self._search_regex(
737 r'class="%s">([\d,]+)</span>' % re.escape(klass),
738 video_webpage, klass, default=None)
739 if count is not None:
740 return int(count.replace(',', ''))
741 return None
742 like_count = _extract_count(u'likes-count')
743 dislike_count = _extract_count(u'dislikes-count')
744
745 # subtitles
746 video_subtitles = self.extract_subtitles(video_id, video_webpage)
747
748 if self._downloader.params.get('listsubtitles', False):
749 self._list_available_subtitles(video_id, video_webpage)
750 return
751
752 if 'length_seconds' not in video_info:
753 self._downloader.report_warning(u'unable to extract video duration')
754 video_duration = None
755 else:
756 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
757
758 # annotations
759 video_annotations = None
760 if self._downloader.params.get('writeannotations', False):
761 video_annotations = self._extract_annotations(video_id)
762
763 # Decide which formats to download
764 try:
765 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
766 if not mobj:
767 raise ValueError('Could not find vevo ID')
768 json_code = uppercase_escape(mobj.group(1))
769 ytplayer_config = json.loads(json_code)
770 args = ytplayer_config['args']
771 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
772 # this signatures are encrypted
773 if 'url_encoded_fmt_stream_map' not in args:
774 raise ValueError(u'No stream_map present') # caught below
775 re_signature = re.compile(r'[&,]s=')
776 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
777 if m_s is not None:
778 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
779 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
780 m_s = re_signature.search(args.get('adaptive_fmts', u''))
781 if m_s is not None:
782 if 'adaptive_fmts' in video_info:
783 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
784 else:
785 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
786 except ValueError:
787 pass
788
789 def _map_to_format_list(urlmap):
790 formats = []
791 for itag, video_real_url in urlmap.items():
792 dct = {
793 'format_id': itag,
794 'url': video_real_url,
795 'player_url': player_url,
796 }
797 if itag in self._formats:
798 dct.update(self._formats[itag])
799 formats.append(dct)
800 return formats
801
802 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
803 self.report_rtmp_download()
804 formats = [{
805 'format_id': '_rtmp',
806 'protocol': 'rtmp',
807 'url': video_info['conn'][0],
808 'player_url': player_url,
809 }]
810 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
811 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
812 if 'rtmpe%3Dyes' in encoded_url_map:
813 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
814 url_map = {}
815 for url_data_str in encoded_url_map.split(','):
816 url_data = compat_parse_qs(url_data_str)
817 if 'itag' not in url_data or 'url' not in url_data:
818 continue
819 format_id = url_data['itag'][0]
820 url = url_data['url'][0]
821
822 if 'sig' in url_data:
823 url += '&signature=' + url_data['sig'][0]
824 elif 's' in url_data:
825 encrypted_sig = url_data['s'][0]
826
827 if not age_gate:
828 jsplayer_url_json = self._search_regex(
829 r'"assets":.+?"js":\s*("[^"]+")',
830 video_webpage, u'JS player URL')
831 player_url = json.loads(jsplayer_url_json)
832 if player_url is None:
833 player_url_json = self._search_regex(
834 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
835 video_webpage, u'age gate player URL')
836 player_url = json.loads(player_url_json)
837
838 if self._downloader.params.get('verbose'):
839 if player_url is None:
840 player_version = 'unknown'
841 player_desc = 'unknown'
842 else:
843 if player_url.endswith('swf'):
844 player_version = self._search_regex(
845 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
846 u'flash player', fatal=False)
847 player_desc = 'flash player %s' % player_version
848 else:
849 player_version = self._search_regex(
850 r'html5player-([^/]+?)(?:/html5player)?\.js',
851 player_url,
852 'html5 player', fatal=False)
853 player_desc = u'html5 player %s' % player_version
854
855 parts_sizes = self._signature_cache_id(encrypted_sig)
856 self.to_screen(u'{%s} signature length %s, %s' %
857 (format_id, parts_sizes, player_desc))
858
859 signature = self._decrypt_signature(
860 encrypted_sig, video_id, player_url, age_gate)
861 url += '&signature=' + signature
862 if 'ratebypass' not in url:
863 url += '&ratebypass=yes'
864 url_map[format_id] = url
865 formats = _map_to_format_list(url_map)
866 elif video_info.get('hlsvp'):
867 manifest_url = video_info['hlsvp'][0]
868 url_map = self._extract_from_m3u8(manifest_url, video_id)
869 formats = _map_to_format_list(url_map)
870 else:
871 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
872
873 # Look for the DASH manifest
874 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
875 try:
876 # The DASH manifest used needs to be the one from the original video_webpage.
877 # The one found in get_video_info seems to be using different signatures.
878 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
879 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
880 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
881 if age_gate:
882 dash_manifest_url = video_info.get('dashmpd')[0]
883 else:
884 dash_manifest_url = ytplayer_config['args']['dashmpd']
885 def decrypt_sig(mobj):
886 s = mobj.group(1)
887 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
888 return '/signature/%s' % dec_s
889 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
890 dash_doc = self._download_xml(
891 dash_manifest_url, video_id,
892 note=u'Downloading DASH manifest',
893 errnote=u'Could not download DASH manifest')
894 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
895 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
896 if url_el is None:
897 continue
898 format_id = r.attrib['id']
899 video_url = url_el.text
900 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
901 f = {
902 'format_id': format_id,
903 'url': video_url,
904 'width': int_or_none(r.attrib.get('width')),
905 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
906 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
907 'filesize': filesize,
908 }
909 try:
910 existing_format = next(
911 fo for fo in formats
912 if fo['format_id'] == format_id)
913 except StopIteration:
914 f.update(self._formats.get(format_id, {}))
915 formats.append(f)
916 else:
917 existing_format.update(f)
918
919 except (ExtractorError, KeyError) as e:
920 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
921
922 self._sort_formats(formats)
923
924 return {
925 'id': video_id,
926 'uploader': video_uploader,
927 'uploader_id': video_uploader_id,
928 'upload_date': upload_date,
929 'title': video_title,
930 'thumbnail': video_thumbnail,
931 'description': video_description,
932 'categories': video_categories,
933 'subtitles': video_subtitles,
934 'duration': video_duration,
935 'age_limit': 18 if age_gate else 0,
936 'annotations': video_annotations,
937 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
938 'view_count': view_count,
939 'like_count': like_count,
940 'dislike_count': dislike_count,
941 'formats': formats,
942 }
943
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Private playlists need authentication; _login is a no-op without
        # credentials (see YoutubeBaseInfoExtractor._login).
        self._login()

    def _ids_to_results(self, ids):
        """Turn a list of video ids into 'url' result dicts handled by the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title element's class name has varied over time; try each known variant.
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract a playlist, delegating mixes ('RD...') to _extract_mix and
        paging through the playlist HTML to collect video ids otherwise."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget carries the URL of the next page, if any.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1054
1055
class YoutubeTopListIE(YoutubePlaylistIE):
    """Handle the "yttoplist:{channel}:{list title}" pseudo-URL scheme."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # Find the playlist link on the channel page by its URL-encoded title.
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            'href="([^"]+?%s.*?)"' % re.escape(query), channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        ids = []
        for attempt in itertools.count(0):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1086
1087
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated, in order of appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = match.group(1)
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            for pagenum in itertools.count(1):
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
1142
1143
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Past the last page: nothing more to yield.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        # PagedList calls download_page lazily as results are consumed.
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1204
1205
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData search API (50 results per page) until n
        results are collected or the API reports no more items.
        Raises ExtractorError when a page comes back without 'items'.
        """
        video_ids = []
        pagenum = 0
        limit = n
        PAGE_SIZE = 50

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)  # GData start-index is 1-based
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never request more than the API reports to exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1247
1248
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE, but the API query adds
    # orderby=published so newest uploads come first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
1254
1255
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the result list from a YouTube search-results page URL."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # Each result is wrapped in an <h3 class="yt-lockup-title"> element.
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1290
1291
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in m_seasons
        ]
1305
1306
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with one remaining '%s' placeholder for the paging value.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # _LOGIN_REQUIRED is True, so this raises without credentials.
        self._login()

    def _real_extract(self, url):
        """Page through the feed_ajax endpoint, collecting video ids until
        the response no longer carries a "load more" paging value."""
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            # The HTML key differs between feed variants; accept either.
            feed_html = info.get('feed_html') or info.get('content_html')
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1351
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Subscriptions feed; paging/extraction inherited from YoutubeFeedsInfoExtractor.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1357
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Recommended-videos feed; paging/extraction inherited from YoutubeFeedsInfoExtractor.
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
1363
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Watch-later list is per-user, hence the personal feed endpoint.
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
1370
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Watch history is per-user, hence the personal feed endpoint.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string like every sibling extractor: the pattern contains regex
    # escapes ('\.') and should not rely on invalid-escape passthrough.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1377
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds an ordinary playlist; delegate to it.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1388
1389
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs missing their v= parameter and fail with a hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch URL with no v= parameter usually means an unquoted '&'
        # made the shell truncate the command line; explain that.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .',
            expected=True)