]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/extractor/youtube.py
Merge branch 'naglis-izlesene'
[yt-dlp.git] / youtube_dl / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3import errno
4import io
5import itertools
6import json
7import os.path
8import re
9import traceback
10
11from .common import InfoExtractor, SearchInfoExtractor
12from .subtitles import SubtitlesInfoExtractor
13from ..jsinterp import JSInterpreter
14from ..swfinterp import SWFInterpreter
15from ..utils import (
16 compat_chr,
17 compat_parse_qs,
18 compat_urllib_parse,
19 compat_urllib_request,
20 compat_urlparse,
21 compat_str,
22
23 clean_html,
24 get_cachedir,
25 get_element_by_id,
26 get_element_by_attribute,
27 ExtractorError,
28 int_or_none,
29 PagedList,
30 unescapeHTML,
31 unified_strdate,
32 orderedSet,
33 write_json_file,
34 uppercase_escape,
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Pin the site language to English so page scraping regexps match.

        Returns True if the request succeeded, False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log into YouTube using the credentials from --username/--password
        or .netrc.

        Returns True on success, False on failure or when no credentials
        were supplied.  Raises ExtractorError when _LOGIN_REQUIRED is set
        but no login info is available.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Consistency fix: every other failure path returns False
            # explicitly; a bare "return" here yielded None.
            return False

        # The GALX hidden field must be echoed back in the login POST.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-verification confirmation form.  Always returns True;
        a download failure raises from _download_webpage."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age before any extraction.
        Each step short-circuits the rest on failure."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
128
129
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Accepts full watch URLs, embed/e/v paths, youtu.be short links, a
    # handful of proxy/mirror hostnames, protocol-relative URLs and bare
    # 11-character IDs.  Group 1 captures the optional URL prefix; group 2
    # captures the video ID (relied on by extract_id below).
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/ # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
                     $"""
    # Pulls the real target out of redirect-style URLs (e.g. age
    # verification pages carrying a next_url query parameter).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Known itag -> format properties, merged into each extracted format
    # dict.  Negative 'preference' values rank special-purpose variants
    # (3D, HLS, DASH) below the plain muxed formats.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
234
    IE_NAME = u'youtube'
    # Integration tests consumed by the test harness; "md5:..." description
    # values are checksums of the full expected text.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            # Mixed-case hostname and protocol-relative URL on purpose.
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
319
320
321 @classmethod
322 def suitable(cls, url):
323 """Receives a URL and returns True if suitable for this IE."""
324 if YoutubePlaylistIE.suitable(url): return False
325 return re.match(cls._VALID_URL, url) is not None
326
327 def __init__(self, *args, **kwargs):
328 super(YoutubeIE, self).__init__(*args, **kwargs)
329 self._player_cache = {}
330
331 def report_video_info_webpage_download(self, video_id):
332 """Report attempt to download video info webpage."""
333 self.to_screen(u'%s: Downloading video info webpage' % video_id)
334
335 def report_information_extraction(self, video_id):
336 """Report attempt to extract video information."""
337 self.to_screen(u'%s: Extracting video information' % video_id)
338
339 def report_unavailable_format(self, video_id, format):
340 """Report extracted video URL."""
341 self.to_screen(u'%s: Format %s not available' % (video_id, format))
342
343 def report_rtmp_download(self):
344 """Indicate the download will use the RTMP protocol."""
345 self.to_screen(u'RTMP download detected')
346
347 def _extract_signature_function(self, video_id, player_url, slen):
348 id_m = re.match(
349 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
350 player_url)
351 if not id_m:
352 raise ExtractorError('Cannot identify player %r' % player_url)
353 player_type = id_m.group('ext')
354 player_id = id_m.group('id')
355
356 # Read from filesystem cache
357 func_id = '%s_%s_%d' % (player_type, player_id, slen)
358 assert os.path.basename(func_id) == func_id
359 cache_dir = get_cachedir(self._downloader.params)
360
361 cache_enabled = cache_dir is not None
362 if cache_enabled:
363 cache_fn = os.path.join(os.path.expanduser(cache_dir),
364 u'youtube-sigfuncs',
365 func_id + '.json')
366 try:
367 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
368 cache_spec = json.load(cachef)
369 return lambda s: u''.join(s[i] for i in cache_spec)
370 except IOError:
371 pass # No cache available
372
373 if player_type == 'js':
374 code = self._download_webpage(
375 player_url, video_id,
376 note=u'Downloading %s player %s' % (player_type, player_id),
377 errnote=u'Download of %s failed' % player_url)
378 res = self._parse_sig_js(code)
379 elif player_type == 'swf':
380 urlh = self._request_webpage(
381 player_url, video_id,
382 note=u'Downloading %s player %s' % (player_type, player_id),
383 errnote=u'Download of %s failed' % player_url)
384 code = urlh.read()
385 res = self._parse_sig_swf(code)
386 else:
387 assert False, 'Invalid player type %r' % player_type
388
389 if cache_enabled:
390 try:
391 test_string = u''.join(map(compat_chr, range(slen)))
392 cache_res = res(test_string)
393 cache_spec = [ord(c) for c in cache_res]
394 try:
395 os.makedirs(os.path.dirname(cache_fn))
396 except OSError as ose:
397 if ose.errno != errno.EEXIST:
398 raise
399 write_json_file(cache_spec, cache_fn)
400 except Exception:
401 tb = traceback.format_exc()
402 self._downloader.report_warning(
403 u'Writing cache to %r failed: %s' % (cache_fn, tb))
404
405 return res
406
    def _print_sig_code(self, func, slen):
        """Print a Python snippet equivalent to the signature function
        *func* for signatures of length *slen* (supports the
        --youtube-print-sig-code debugging option)."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a run of indices with constant step as one slice.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            step = None
            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk consecutive index pairs, coalescing +/-1 runs into slices
            # and emitting lone indices as s[i].
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it or close it out.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices begin a new run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield u's[%d]' % prev
            # Flush the final element or the run still in progress.
            if step is None:
                yield u's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe func with distinct characters to recover its permutation.
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
442
443 def _parse_sig_js(self, jscode):
444 funcname = self._search_regex(
445 r'signature=([$a-zA-Z]+)', jscode,
446 u'Initial JS player signature function name')
447
448 jsi = JSInterpreter(jscode)
449 initial_function = jsi.extract_function(funcname)
450 return lambda s: initial_function([s])
451
452 def _parse_sig_swf(self, file_contents):
453 swfi = SWFInterpreter(file_contents)
454 TARGET_CLASSNAME = u'SignatureDecipher'
455 searched_class = swfi.extract_class(TARGET_CLASSNAME)
456 initial_function = swfi.extract_function(searched_class, u'decipher')
457 return lambda s: initial_function([s])
458
459 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
460 """Turn the encrypted s field into a working signature"""
461
462 if player_url is None:
463 raise ExtractorError(u'Cannot decrypt signature without player_url')
464
465 if player_url.startswith(u'//'):
466 player_url = u'https:' + player_url
467 try:
468 player_id = (player_url, len(s))
469 if player_id not in self._player_cache:
470 func = self._extract_signature_function(
471 video_id, player_url, len(s)
472 )
473 self._player_cache[player_id] = func
474 func = self._player_cache[player_id]
475 if self._downloader.params.get('youtube_print_sig_code'):
476 self._print_sig_code(func, len(s))
477 return func(s)
478 except Exception as e:
479 tb = traceback.format_exc()
480 raise ExtractorError(
481 u'Automatic signature extraction failed: ' + tb, cause=e)
482
483 def _get_available_subtitles(self, video_id, webpage):
484 try:
485 sub_list = self._download_webpage(
486 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
487 video_id, note=False)
488 except ExtractorError as err:
489 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
490 return {}
491 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
492
493 sub_lang_list = {}
494 for l in lang_list:
495 lang = l[1]
496 params = compat_urllib_parse.urlencode({
497 'lang': lang,
498 'v': video_id,
499 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
500 'name': unescapeHTML(l[0]).encode('utf-8'),
501 })
502 url = u'https://www.youtube.com/api/timedtext?' + params
503 sub_lang_list[lang] = url
504 if not sub_lang_list:
505 self._downloader.report_warning(u'video doesn\'t have subtitles')
506 return {}
507 return sub_lang_list
508
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language code -> caption URL for the
        machine-translated ASR captions, or {} when none are available."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL lives in the embedded player configuration.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # Only an ASR-kind <track> means automatic captions exist.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']

            sub_lang_list = {}
            # Each <target> is a language the ASR track can be translated to.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': 'asr',
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
555
556 @classmethod
557 def extract_id(cls, url):
558 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
559 if mobj is None:
560 raise ExtractorError(u'Invalid URL: %s' % url)
561 video_id = mobj.group(2)
562 return video_id
563
564 def _extract_from_m3u8(self, manifest_url, video_id):
565 url_map = {}
566 def _get_urls(_manifest):
567 lines = _manifest.split('\n')
568 urls = filter(lambda l: l and not l.startswith('#'),
569 lines)
570 return urls
571 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
572 formats_urls = _get_urls(manifest)
573 for format_url in formats_urls:
574 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
575 url_map[itag] = format_url
576 return url_map
577
578 def _extract_annotations(self, video_id):
579 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
580 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
581
582 def _real_extract(self, url):
583 proto = (
584 u'http' if self._downloader.params.get('prefer_insecure', False)
585 else u'https')
586
587 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
588 mobj = re.search(self._NEXT_URL_RE, url)
589 if mobj:
590 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
591 video_id = self.extract_id(url)
592
593 # Get video webpage
594 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
595 video_webpage = self._download_webpage(url, video_id)
596
597 # Attempt to extract SWF player URL
598 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
599 if mobj is not None:
600 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
601 else:
602 player_url = None
603
604 # Get video info
605 self.report_video_info_webpage_download(video_id)
606 if re.search(r'player-age-gate-content">', video_webpage) is not None:
607 self.report_age_confirmation()
608 age_gate = True
609 # We simulate the access to the video from www.youtube.com/v/{video_id}
610 # this can be viewed without login into Youtube
611 data = compat_urllib_parse.urlencode({
612 'video_id': video_id,
613 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
614 'sts': self._search_regex(
615 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
616 })
617 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
618 video_info_webpage = self._download_webpage(video_info_url, video_id,
619 note=False,
620 errnote='unable to download video info webpage')
621 video_info = compat_parse_qs(video_info_webpage)
622 else:
623 age_gate = False
624 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
625 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
626 % (video_id, el_type))
627 video_info_webpage = self._download_webpage(video_info_url, video_id,
628 note=False,
629 errnote='unable to download video info webpage')
630 video_info = compat_parse_qs(video_info_webpage)
631 if 'token' in video_info:
632 break
633 if 'token' not in video_info:
634 if 'reason' in video_info:
635 raise ExtractorError(
636 u'YouTube said: %s' % video_info['reason'][0],
637 expected=True, video_id=video_id)
638 else:
639 raise ExtractorError(
640 u'"token" parameter not in video info for unknown reason',
641 video_id=video_id)
642
643 if 'view_count' in video_info:
644 view_count = int(video_info['view_count'][0])
645 else:
646 view_count = None
647
648 # Check for "rental" videos
649 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
650 raise ExtractorError(u'"rental" videos not supported')
651
652 # Start extracting information
653 self.report_information_extraction(video_id)
654
655 # uploader
656 if 'author' not in video_info:
657 raise ExtractorError(u'Unable to extract uploader name')
658 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
659
660 # uploader_id
661 video_uploader_id = None
662 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
663 if mobj is not None:
664 video_uploader_id = mobj.group(1)
665 else:
666 self._downloader.report_warning(u'unable to extract uploader nickname')
667
668 # title
669 if 'title' in video_info:
670 video_title = video_info['title'][0]
671 else:
672 self._downloader.report_warning(u'Unable to extract video title')
673 video_title = u'_'
674
675 # thumbnail image
676 # We try first to get a high quality image:
677 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
678 video_webpage, re.DOTALL)
679 if m_thumb is not None:
680 video_thumbnail = m_thumb.group(1)
681 elif 'thumbnail_url' not in video_info:
682 self._downloader.report_warning(u'unable to extract video thumbnail')
683 video_thumbnail = None
684 else: # don't panic if we can't find it
685 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
686
687 # upload date
688 upload_date = None
689 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
690 if mobj is None:
691 mobj = re.search(
692 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
693 video_webpage)
694 if mobj is not None:
695 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
696 upload_date = unified_strdate(upload_date)
697
698 m_cat_container = get_element_by_id("eow-category", video_webpage)
699 if m_cat_container:
700 category = self._html_search_regex(
701 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
702 default=None)
703 video_categories = None if category is None else [category]
704 else:
705 video_categories = None
706
707 # description
708 video_description = get_element_by_id("eow-description", video_webpage)
709 if video_description:
710 video_description = re.sub(r'''(?x)
711 <a\s+
712 (?:[a-zA-Z-]+="[^"]+"\s+)*?
713 title="([^"]+)"\s+
714 (?:[a-zA-Z-]+="[^"]+"\s+)*?
715 class="yt-uix-redirect-link"\s*>
716 [^<]+
717 </a>
718 ''', r'\1', video_description)
719 video_description = clean_html(video_description)
720 else:
721 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
722 if fd_mobj:
723 video_description = unescapeHTML(fd_mobj.group(1))
724 else:
725 video_description = u''
726
727 def _extract_count(klass):
728 count = self._search_regex(
729 r'class="%s">([\d,]+)</span>' % re.escape(klass),
730 video_webpage, klass, default=None)
731 if count is not None:
732 return int(count.replace(',', ''))
733 return None
734 like_count = _extract_count(u'likes-count')
735 dislike_count = _extract_count(u'dislikes-count')
736
737 # subtitles
738 video_subtitles = self.extract_subtitles(video_id, video_webpage)
739
740 if self._downloader.params.get('listsubtitles', False):
741 self._list_available_subtitles(video_id, video_webpage)
742 return
743
744 if 'length_seconds' not in video_info:
745 self._downloader.report_warning(u'unable to extract video duration')
746 video_duration = None
747 else:
748 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
749
750 # annotations
751 video_annotations = None
752 if self._downloader.params.get('writeannotations', False):
753 video_annotations = self._extract_annotations(video_id)
754
755 # Decide which formats to download
756 try:
757 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
758 if not mobj:
759 raise ValueError('Could not find vevo ID')
760 json_code = uppercase_escape(mobj.group(1))
761 ytplayer_config = json.loads(json_code)
762 args = ytplayer_config['args']
763 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
764 # this signatures are encrypted
765 if 'url_encoded_fmt_stream_map' not in args:
766 raise ValueError(u'No stream_map present') # caught below
767 re_signature = re.compile(r'[&,]s=')
768 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
769 if m_s is not None:
770 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
771 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
772 m_s = re_signature.search(args.get('adaptive_fmts', u''))
773 if m_s is not None:
774 if 'adaptive_fmts' in video_info:
775 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
776 else:
777 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
778 except ValueError:
779 pass
780
781 def _map_to_format_list(urlmap):
782 formats = []
783 for itag, video_real_url in urlmap.items():
784 dct = {
785 'format_id': itag,
786 'url': video_real_url,
787 'player_url': player_url,
788 }
789 if itag in self._formats:
790 dct.update(self._formats[itag])
791 formats.append(dct)
792 return formats
793
794 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
795 self.report_rtmp_download()
796 formats = [{
797 'format_id': '_rtmp',
798 'protocol': 'rtmp',
799 'url': video_info['conn'][0],
800 'player_url': player_url,
801 }]
802 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
803 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
804 if 'rtmpe%3Dyes' in encoded_url_map:
805 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
806 url_map = {}
807 for url_data_str in encoded_url_map.split(','):
808 url_data = compat_parse_qs(url_data_str)
809 if 'itag' in url_data and 'url' in url_data:
810 url = url_data['url'][0]
811 if 'sig' in url_data:
812 url += '&signature=' + url_data['sig'][0]
813 elif 's' in url_data:
814 encrypted_sig = url_data['s'][0]
815
816 if not age_gate:
817 jsplayer_url_json = self._search_regex(
818 r'"assets":.+?"js":\s*("[^"]+")',
819 video_webpage, u'JS player URL')
820 player_url = json.loads(jsplayer_url_json)
821 if player_url is None:
822 player_url_json = self._search_regex(
823 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
824 video_webpage, u'age gate player URL')
825 player_url = json.loads(player_url_json)
826
827 if self._downloader.params.get('verbose'):
828 if player_url is None:
829 player_version = 'unknown'
830 player_desc = 'unknown'
831 else:
832 if player_url.endswith('swf'):
833 player_version = self._search_regex(
834 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
835 u'flash player', fatal=False)
836 player_desc = 'flash player %s' % player_version
837 else:
838 player_version = self._search_regex(
839 r'html5player-([^/]+?)(?:/html5player)?\.js',
840 player_url,
841 'html5 player', fatal=False)
842 player_desc = u'html5 player %s' % player_version
843
844 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
845 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
846 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
847
848 signature = self._decrypt_signature(
849 encrypted_sig, video_id, player_url, age_gate)
850 url += '&signature=' + signature
851 if 'ratebypass' not in url:
852 url += '&ratebypass=yes'
853 url_map[url_data['itag'][0]] = url
854 formats = _map_to_format_list(url_map)
855 elif video_info.get('hlsvp'):
856 manifest_url = video_info['hlsvp'][0]
857 url_map = self._extract_from_m3u8(manifest_url, video_id)
858 formats = _map_to_format_list(url_map)
859 else:
860 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
861
862 # Look for the DASH manifest
863 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
864 try:
865 # The DASH manifest used needs to be the one from the original video_webpage.
866 # The one found in get_video_info seems to be using different signatures.
867 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
868 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
869 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
870 if age_gate:
871 dash_manifest_url = video_info.get('dashmpd')[0]
872 else:
873 dash_manifest_url = ytplayer_config['args']['dashmpd']
874 def decrypt_sig(mobj):
875 s = mobj.group(1)
876 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
877 return '/signature/%s' % dec_s
878 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
879 dash_doc = self._download_xml(
880 dash_manifest_url, video_id,
881 note=u'Downloading DASH manifest',
882 errnote=u'Could not download DASH manifest')
883 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
884 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
885 if url_el is None:
886 continue
887 format_id = r.attrib['id']
888 video_url = url_el.text
889 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
890 f = {
891 'format_id': format_id,
892 'url': video_url,
893 'width': int_or_none(r.attrib.get('width')),
894 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
895 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
896 'filesize': filesize,
897 }
898 try:
899 existing_format = next(
900 fo for fo in formats
901 if fo['format_id'] == format_id)
902 except StopIteration:
903 f.update(self._formats.get(format_id, {}))
904 formats.append(f)
905 else:
906 existing_format.update(f)
907
908 except (ExtractorError, KeyError) as e:
909 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
910
911 self._sort_formats(formats)
912
913 return {
914 'id': video_id,
915 'uploader': video_uploader,
916 'uploader_id': video_uploader_id,
917 'upload_date': upload_date,
918 'title': video_title,
919 'thumbnail': video_thumbnail,
920 'description': video_description,
921 'categories': video_categories,
922 'subtitles': video_subtitles,
923 'duration': video_duration,
924 'age_limit': 18 if age_gate else 0,
925 'annotations': video_annotations,
926 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
927 'view_count': view_count,
928 'like_count': like_count,
929 'dislike_count': dislike_count,
930 'formats': formats,
931 }
932
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist, including mixes and 'p/' URLs."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    # Canonical playlist page URL; %s is the playlist id.
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # Substring present in the "load more" widget while further pages exist.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # One playlist entry; captures the video id and its position in the list.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def _real_initialize(self):
        # Private playlists need authentication; _login is a no-op without
        # credentials (see YoutubeBaseInfoExtractor).
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each plain video id into an url_result handled by YoutubeIE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist; ids are scraped from the watch page since
        mixes have no regular playlist page."""
        # The mixes are generated from a a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title markup varies between layouts; try known class names in order.
        title_span = (search_title('playlist-title') or
            search_title('title long-title') or search_title('title'))
        title = clean_html(title_span)
        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # The first page doubles as both content and "load more" widget.
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Follow the "load more" AJAX link until it disappears.
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1043
1044
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract YouTube "top list" charts addressed via the yttoplist keyword."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')
        # Find the playlist link on the channel page via its title query string.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        ids = []
        attempt = 0
        while not ids:
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            attempt += 1
        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1075
1076
class YoutubeChannelIE(InfoExtractor):
    """Extract every video uploaded to a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found in page, deduplicated, in page order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = match.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Autogenerated channels list every video on this single page; their
        # ajax pagination endpoints come back empty.
        autogenerated = re.search(r'''(?x)
            class="[^"]*?(?:
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page) is not None

        if autogenerated:
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Page through the json-based channel_ajax query until the
            # "load more" widget disappears.
            video_ids = []
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                   for video_id in video_ids]
        return self.playlist_result(entries, channel_id)
1131
1132
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Our regex is too permissive, so let every other youtube extractor
        # claim the URL first; only handle what none of them recognize.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = match.group(1)

        # The gdata API caps results per request (currently 50 videos), so
        # fetch page by page until a page comes back without entries.
        def download_page(pagenum):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No more videos: stop the PagedList iteration.
                return

            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1193
1194
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearchN:query" searches through the gdata API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        video_ids = []
        limit = n
        page_index = 0

        while PAGE_SIZE * page_index < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * page_index + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the real total; never request past it.
            limit = min(n, api_response['totalItems'])
            page_index += 1

        # Pages are fetched in blocks of 50, so trim any overshoot.
        del video_ids[n:]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1236
1237
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as YoutubeSearchIE but orders results newest-first."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Identical to the parent API URL apart from the orderby=published suffix.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1243
1244
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the result list from a YouTube search-results page URL."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # Each result is wrapped in an <h3 class="yt-lockup-title"> element.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', url_snippet),
                'title': title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1279
1280
class YoutubeShowIE(InfoExtractor):
    """Extract all season playlists of a YouTube show page."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        results = []
        for season in m_seasons:
            results.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
1294
1295
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Personal feeds (watch later, history) use a different ajax action.
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        entries = []
        paging = 0
        for page_idx in itertools.count(1):
            info = self._download_json(self._FEED_TEMPLATE % paging,
                                       u'%s feed' % self._FEED_NAME,
                                       u'Downloading page %s' % page_idx)
            feed_html = info.get('feed_html') or info.get('content_html')
            ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # Stop when the "load more" link no longer advertises a next page.
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1340
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed of new videos from the logged-in user's subscriptions."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1346
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed of videos YouTube recommends to the logged-in user."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1352
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's "watch later" list (a personal feed)."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1359
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's watch history (a personal feed)."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fix: this was a plain u'' literal, so the \. sequences were invalid
    # string escapes (a DeprecationWarning on py3) rather than a raw-string
    # regex like every sibling extractor's _VALID_URL. The matched pattern
    # is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1366
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the actual playlist id in a list= parameter.
        favourites_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1377
1378
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs whose v= parameter was eaten by the shell and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        # Matching one of these URLs means the video id was lost, almost
        # always because the URL was not quoted and the shell split it at '&'.
        raise ExtractorError(
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .',
            expected=True)