]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Clarify a couple of calls
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211
PH
3import collections
4import itertools
5import io
c5e8d7af
PH
6import json
7import netrc
8import re
9import socket
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af
PH
17from ..utils import (
18 compat_http_client,
19 compat_parse_qs,
20 compat_urllib_error,
21 compat_urllib_parse,
22 compat_urllib_request,
23 compat_str,
24
25 clean_html,
26 get_element_by_id,
27 ExtractorError,
28 unescapeHTML,
29 unified_strdate,
04cc9617 30 orderedSet,
c5e8d7af
PH
31)
32
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL so that later pages are served in English.

        Returns True on success. On a network error a warning is reported
        and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the credentials returned by _get_login_info().

        Returns True when the login request went through, False when no
        credentials are available or the login failed (a warning is
        reported in the failure cases).

        Raises ExtractorError when no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden form tokens that have to be echoed back with the credentials.
        galx = None
        dsh = None
        match = re.search(r'<input.+?name="GALX".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            galx = match.group(1)
        match = re.search(r'<input.+?name="dsh".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode.
        # Fields whose value could not be determined (e.g. a hidden input
        # missing from the login page) are left out instead of crashing on
        # None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items()
            if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the login was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL.

        Returns True on success; raises ExtractorError on a network error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data has to be bytes on Python 3; encode it the same way
        # _login() does.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that reports failure."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 139
8377574c 140
de7f3446 141class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 142 IE_DESC = u'YouTube.com'
c5e8d7af
PH
143 _VALID_URL = r"""^
144 (
145 (?:https?://)? # http(s):// (optional)
f4b05232 146 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
e69ae5b9
JMF
147 tube\.majestyc\.net/|
148 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
149 (?:.*?\#/)? # handle anchor (#/) redirect urls
150 (?: # the various things that can precede the ID:
151 (?:(?:v|embed|e)/) # v/ or embed/ or e/
152 |(?: # or the v= param in all its forms
d741e55a 153 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
154 (?:\?|\#!?) # the params delimiter ? or # or #!
155 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
156 v=
157 )
f4b05232
JMF
158 ))
159 |youtu\.be/ # just youtu.be/xxxx
160 )
c5e8d7af 161 )? # all until now is optional -> you can pass the naked ID
8963d9c2 162 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
163 (?(1).+)? # if we found the ID, everything can follow
164 $"""
c5e8d7af 165 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
c5e8d7af 166 # Listed in order of quality
bdc6b3fc 167 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
96fb5605 168 # Apple HTTP Live Streaming
bdc6b3fc 169 '96', '95', '94', '93', '92', '132', '151',
939fbd26
JMF
170 # 3D
171 '85', '84', '102', '83', '101', '82', '100',
172 # Dash video
173 '138', '137', '248', '136', '247', '135', '246',
174 '245', '244', '134', '243', '133', '242', '160',
175 # Dash audio
176 '141', '172', '140', '171', '139',
1d043b93 177 ]
bdc6b3fc 178 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
96fb5605 179 # Apple HTTP Live Streaming
bdc6b3fc
AZ
180 '96', '95', '94', '93', '92', '132', '151',
181 # 3D
86fe61c8 182 '85', '102', '84', '101', '83', '100', '82',
939fbd26
JMF
183 # Dash video
184 '138', '248', '137', '247', '136', '246', '245',
185 '244', '135', '243', '134', '242', '133', '160',
186 # Dash audio
187 '172', '141', '171', '140', '139',
1d043b93 188 ]
bdc6b3fc
AZ
189 _video_formats_map = {
190 'flv': ['35', '34', '6', '5'],
191 '3gp': ['36', '17', '13'],
192 'mp4': ['38', '37', '22', '18'],
193 'webm': ['46', '45', '44', '43'],
194 }
c5e8d7af
PH
195 _video_extensions = {
196 '13': '3gp',
bdc6b3fc 197 '17': '3gp',
c5e8d7af
PH
198 '18': 'mp4',
199 '22': 'mp4',
bdc6b3fc 200 '36': '3gp',
c5e8d7af 201 '37': 'mp4',
d69cf69a 202 '38': 'mp4',
c5e8d7af
PH
203 '43': 'webm',
204 '44': 'webm',
205 '45': 'webm',
206 '46': 'webm',
1d043b93 207
86fe61c8
AZ
208 # 3d videos
209 '82': 'mp4',
210 '83': 'mp4',
211 '84': 'mp4',
212 '85': 'mp4',
213 '100': 'webm',
214 '101': 'webm',
215 '102': 'webm',
836a086c 216
96fb5605 217 # Apple HTTP Live Streaming
1d043b93
JMF
218 '92': 'mp4',
219 '93': 'mp4',
220 '94': 'mp4',
221 '95': 'mp4',
222 '96': 'mp4',
223 '132': 'mp4',
224 '151': 'mp4',
836a086c
AZ
225
226 # Dash mp4
227 '133': 'mp4',
228 '134': 'mp4',
229 '135': 'mp4',
230 '136': 'mp4',
231 '137': 'mp4',
232 '138': 'mp4',
233 '139': 'mp4',
234 '140': 'mp4',
235 '141': 'mp4',
236 '160': 'mp4',
237
238 # Dash webm
239 '171': 'webm',
240 '172': 'webm',
241 '242': 'webm',
242 '243': 'webm',
243 '244': 'webm',
244 '245': 'webm',
245 '246': 'webm',
246 '247': 'webm',
247 '248': 'webm',
c5e8d7af
PH
248 }
249 _video_dimensions = {
250 '5': '240x400',
251 '6': '???',
252 '13': '???',
253 '17': '144x176',
254 '18': '360x640',
255 '22': '720x1280',
256 '34': '360x640',
257 '35': '480x854',
bdc6b3fc 258 '36': '240x320',
c5e8d7af
PH
259 '37': '1080x1920',
260 '38': '3072x4096',
261 '43': '360x640',
262 '44': '480x854',
263 '45': '720x1280',
264 '46': '1080x1920',
86fe61c8
AZ
265 '82': '360p',
266 '83': '480p',
267 '84': '720p',
268 '85': '1080p',
1d043b93
JMF
269 '92': '240p',
270 '93': '360p',
271 '94': '480p',
272 '95': '720p',
273 '96': '1080p',
86fe61c8
AZ
274 '100': '360p',
275 '101': '480p',
836a086c 276 '102': '720p',
1d043b93
JMF
277 '132': '240p',
278 '151': '72p',
836a086c
AZ
279 '133': '240p',
280 '134': '360p',
281 '135': '480p',
282 '136': '720p',
283 '137': '1080p',
284 '138': '>1080p',
285 '139': '48k',
286 '140': '128k',
287 '141': '256k',
288 '160': '192p',
289 '171': '128k',
290 '172': '256k',
291 '242': '240p',
292 '243': '360p',
293 '244': '480p',
294 '245': '480p',
295 '246': '480p',
296 '247': '720p',
297 '248': '1080p',
c5e8d7af 298 }
836a086c
AZ
299 _special_itags = {
300 '82': '3D',
301 '83': '3D',
302 '84': '3D',
303 '85': '3D',
304 '100': '3D',
305 '101': '3D',
306 '102': '3D',
307 '133': 'DASH Video',
308 '134': 'DASH Video',
309 '135': 'DASH Video',
310 '136': 'DASH Video',
311 '137': 'DASH Video',
312 '138': 'DASH Video',
313 '139': 'DASH Audio',
314 '140': 'DASH Audio',
315 '141': 'DASH Audio',
316 '160': 'DASH Video',
317 '171': 'DASH Audio',
318 '172': 'DASH Audio',
319 '242': 'DASH Video',
320 '243': 'DASH Video',
321 '244': 'DASH Video',
322 '245': 'DASH Video',
323 '246': 'DASH Video',
324 '247': 'DASH Video',
325 '248': 'DASH Video',
c5e8d7af 326 }
836a086c 327
c5e8d7af 328 IE_NAME = u'youtube'
2eb88d95
PH
329 _TESTS = [
330 {
0e853ca4
PH
331 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
332 u"file": u"BaW_jenozKc.mp4",
333 u"info_dict": {
334 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
335 u"uploader": u"Philipp Hagemeister",
336 u"uploader_id": u"phihag",
337 u"upload_date": u"20121002",
338 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
2eb88d95 339 }
0e853ca4
PH
340 },
341 {
342 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
343 u"file": u"1ltcDfZMA3U.flv",
344 u"note": u"Test VEVO video (#897)",
345 u"info_dict": {
346 u"upload_date": u"20070518",
347 u"title": u"Maps - It Will Find You",
348 u"description": u"Music video by Maps performing It Will Find You.",
349 u"uploader": u"MuteUSA",
350 u"uploader_id": u"MuteUSA"
2eb88d95 351 }
0e853ca4
PH
352 },
353 {
354 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
355 u"file": u"UxxajLWwzqY.mp4",
356 u"note": u"Test generic use_cipher_signature video (#897)",
357 u"info_dict": {
358 u"upload_date": u"20120506",
359 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
c7bf7366 360 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
45ed795c 361 u"uploader": u"Icona Pop",
0e853ca4 362 u"uploader_id": u"IconaPop"
2eb88d95 363 }
c108eb73
JMF
364 },
365 {
366 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
367 u"file": u"07FYdnEawAQ.mp4",
368 u"note": u"Test VEVO video with age protection (#956)",
369 u"info_dict": {
370 u"upload_date": u"20130703",
371 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
372 u"description": u"md5:64249768eec3bc4276236606ea996373",
373 u"uploader": u"justintimberlakeVEVO",
374 u"uploader_id": u"justintimberlakeVEVO"
375 }
376 },
1d043b93
JMF
377 {
378 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
379 u'file': u'TGi3HqYrWHE.mp4',
380 u'note': u'm3u8 video',
381 u'info_dict': {
382 u'title': u'Triathlon - Men - London 2012 Olympic Games',
383 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
384 u'uploader': u'olympic',
385 u'upload_date': u'20120807',
386 u'uploader_id': u'olympic',
387 },
388 u'params': {
389 u'skip_download': True,
390 },
391 },
2eb88d95
PH
392 ]
393
c5e8d7af
PH
394
@classmethod
def suitable(cls, url):
    """Tell whether this IE should handle *url*.

    URLs accepted by YoutubePlaylistIE are always rejected here; any other
    URL is accepted when it matches _VALID_URL (a verbose regex).
    """
    is_playlist = YoutubePlaylistIE.suitable(url)
    return not is_playlist and re.match(cls._VALID_URL, url, re.VERBOSE) is not None
400
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractor, then start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature: player URL -> signature function.
    self._player_cache = dict()
e0df6211 404
c5e8d7af
PH
def report_video_webpage_download(self, video_id):
    """Tell the user that the video webpage is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
408
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
412
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
416
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
420
def report_rtmp_download(self):
    """Tell the user that an RTMP download was detected."""
    message = u'RTMP download detected'
    self.to_screen(message)
424
e0df6211 425 def _extract_signature_function(self, video_id, player_url):
83799698
PH
426 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9]+)\.(?P<ext>[a-z]+)$',
427 player_url)
e0df6211
PH
428 player_type = id_m.group('ext')
429 player_id = id_m.group('id')
430
83799698
PH
431 # TODO read from filesystem cache
432
e0df6211
PH
433 if player_type == 'js':
434 code = self._download_webpage(
435 player_url, video_id,
83799698 436 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 437 errnote=u'Download of %s failed' % player_url)
83799698 438 res = self._parse_sig_js(code)
e0df6211
PH
439 elif player_tpye == 'swf':
440 urlh = self._request_webpage(
441 player_url, video_id,
83799698 442 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
443 errnote=u'Download of %s failed' % player_url)
444 code = urlh.read()
83799698 445 res = self._parse_sig_swf(code)
e0df6211
PH
446 else:
447 assert False, 'Invalid player type %r' % player_type
448
83799698
PH
449 # TODO write cache
450
451 return res
452
e0df6211
PH
453 def _parse_sig_js(self, jscode):
454 funcname = self._search_regex(
455 r'signature=([a-zA-Z]+)', jscode,
456 u'Initial JS player signature function name')
457
458 functions = {}
459
460 def argidx(varname):
461 return string.lowercase.index(varname)
462
463 def interpret_statement(stmt, local_vars, allow_recursion=20):
464 if allow_recursion < 0:
465 raise ExctractorError(u'Recursion limit reached')
466
467 if stmt.startswith(u'var '):
468 stmt = stmt[len(u'var '):]
469 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
470 r'=(?P<expr>.*)$', stmt)
471 if ass_m:
472 if ass_m.groupdict().get('index'):
473 def assign(val):
474 lvar = local_vars[ass_m.group('out')]
475 idx = interpret_expression(ass_m.group('index'),
476 local_vars, allow_recursion)
477 assert isinstance(idx, int)
478 lvar[idx] = val
479 return val
480 expr = ass_m.group('expr')
481 else:
482 def assign(val):
483 local_vars[ass_m.group('out')] = val
484 return val
485 expr = ass_m.group('expr')
486 elif stmt.startswith(u'return '):
487 assign = lambda v: v
488 expr = stmt[len(u'return '):]
489 else:
490 raise ExtractorError(
491 u'Cannot determine left side of statement in %r' % stmt)
492
493 v = interpret_expression(expr, local_vars, allow_recursion)
494 return assign(v)
495
496 def interpret_expression(expr, local_vars, allow_recursion):
497 if expr.isdigit():
498 return int(expr)
499
500 if expr.isalpha():
501 return local_vars[expr]
502
503 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
504 if m:
505 member = m.group('member')
506 val = local_vars[m.group('in')]
507 if member == 'split("")':
508 return list(val)
509 if member == 'join("")':
510 return u''.join(val)
511 if member == 'length':
512 return len(val)
513 if member == 'reverse()':
514 return val[::-1]
515 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
516 if slice_m:
517 idx = interpret_expression(
518 slice_m.group('idx'), local_vars, allow_recursion-1)
519 return val[idx:]
520
521 m = re.match(
522 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
523 if m:
524 val = local_vars[m.group('in')]
525 idx = interpret_expression(m.group('idx'), local_vars,
526 allow_recursion-1)
527 return val[idx]
528
529 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
530 if m:
531 a = interpret_expression(m.group('a'),
532 local_vars, allow_recursion)
533 b = interpret_expression(m.group('b'),
534 local_vars, allow_recursion)
535 return a % b
536
537 m = re.match(
538 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
539 if m:
540 fname = m.group('func')
541 if fname not in functions:
542 functions[fname] = extract_function(fname)
543 argvals = [int(v) if v.isdigit() else local_vars[v]
544 for v in m.group('args').split(',')]
545 return functions[fname](argvals)
546 raise ExtractorError(u'Unsupported JS expression %r' % expr)
547
548 def extract_function(funcname):
549 func_m = re.search(
550 r'function ' + re.escape(funcname) +
551 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
552 jscode)
553 argnames = func_m.group('args').split(',')
554
555 def resf(args):
556 local_vars = dict(zip(argnames, args))
557 for stmt in func_m.group('code').split(';'):
558 res = interpret_statement(stmt, local_vars)
559 return res
560 return resf
561
562 initial_function = extract_function(funcname)
563 return lambda s: initial_function([s])
564
565 def _parse_sig_swf(self, file_contents):
566 if file_contents[1:3] != b'WS':
567 raise ExtractorError(
568 u'Not an SWF file; header is %r' % file_contents[:3])
569 if file_contents[:1] == b'C':
570 content = zlib.decompress(file_contents[8:])
571 else:
572 raise NotImplementedError(u'Unsupported compression format %r' %
573 file_contents[:1])
574
575 def extract_tags(content):
576 pos = 0
577 while pos < len(content):
578 header16 = struct.unpack('<H', content[pos:pos+2])[0]
579 pos += 2
580 tag_code = header16 >> 6
581 tag_len = header16 & 0x3f
582 if tag_len == 0x3f:
583 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
584 pos += 4
585 assert pos+tag_len <= len(content)
586 yield (tag_code, content[pos:pos+tag_len])
587 pos += tag_len
588
589 code_tag = next(tag
590 for tag_code, tag in extract_tags(content)
591 if tag_code == 82)
592 p = code_tag.index(b'\0', 4) + 1
ba552f54 593 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
594
595 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
596 def read_int(reader=None):
597 if reader is None:
598 reader = code_reader
e0df6211
PH
599 res = 0
600 shift = 0
601 for _ in range(5):
ba552f54
PH
602 buf = reader.read(1)
603 assert len(buf) == 1
604 b = struct.unpack('<B', buf)[0]
e0df6211
PH
605 res = res | ((b & 0x7f) << shift)
606 if b & 0x80 == 0:
607 break
608 shift += 7
ba552f54
PH
609 return res
610
611 def u30(reader=None):
612 res = read_int(reader)
613 assert res & 0xf0000000 == 0
e0df6211
PH
614 return res
615 u32 = read_int
616
ba552f54
PH
617 def s32(reader=None):
618 v = read_int(reader)
e0df6211
PH
619 if v & 0x80000000 != 0:
620 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
621 return v
622
623 def string(reader=None):
624 if reader is None:
625 reader = code_reader
626 slen = u30(reader)
627 resb = reader.read(slen)
628 assert len(resb) == slen
629 return resb.decode('utf-8')
630
631 def read_bytes(count, reader=None):
632 if reader is None:
633 reader = code_reader
634 resb = reader.read(count)
635 assert len(resb) == count
636 return resb
637
638 def read_byte(reader=None):
639 resb = read_bytes(1, reader=reader)
640 res = struct.unpack('<B', resb)[0]
641 return res
e0df6211
PH
642
643 # minor_version + major_version
2f2ffea9 644 _ = read_bytes(2 + 2)
e0df6211
PH
645
646 # Constant pool
ba552f54 647 int_count = u30()
e0df6211 648 for _c in range(1, int_count):
ba552f54
PH
649 _ = s32()
650 uint_count = u30()
e0df6211 651 for _c in range(1, uint_count):
ba552f54
PH
652 _ = u32()
653 double_count = u30()
654 _ = read_bytes((double_count-1) * 8)
655 string_count = u30()
e0df6211
PH
656 constant_strings = [u'']
657 for _c in range(1, string_count):
ba552f54 658 s = string()
e0df6211 659 constant_strings.append(s)
ba552f54 660 namespace_count = u30()
e0df6211 661 for _c in range(1, namespace_count):
ba552f54
PH
662 _ = read_bytes(1) # kind
663 _ = u30() # name
664 ns_set_count = u30()
e0df6211 665 for _c in range(1, ns_set_count):
ba552f54 666 count = u30()
e0df6211 667 for _c2 in range(count):
ba552f54
PH
668 _ = u30()
669 multiname_count = u30()
e0df6211
PH
670 MULTINAME_SIZES = {
671 0x07: 2, # QName
672 0x0d: 2, # QNameA
673 0x0f: 1, # RTQName
674 0x10: 1, # RTQNameA
675 0x11: 0, # RTQNameL
676 0x12: 0, # RTQNameLA
677 0x09: 2, # Multiname
678 0x0e: 2, # MultinameA
679 0x1b: 1, # MultinameL
680 0x1c: 1, # MultinameLA
681 }
682 multinames = [u'']
683 for _c in range(1, multiname_count):
ba552f54 684 kind = u30()
e0df6211
PH
685 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
686 if kind == 0x07:
ba552f54
PH
687 namespace_idx = u30()
688 name_idx = u30()
e0df6211
PH
689 multinames.append(constant_strings[name_idx])
690 else:
691 multinames.append('[MULTINAME kind: %d]' % kind)
692 for _c2 in range(MULTINAME_SIZES[kind]):
ba552f54 693 _ = u30()
e0df6211
PH
694
695 # Methods
ba552f54 696 method_count = u30()
e0df6211
PH
697 MethodInfo = collections.namedtuple(
698 'MethodInfo',
699 ['NEED_ARGUMENTS', 'NEED_REST'])
700 method_infos = []
701 for method_id in range(method_count):
ba552f54
PH
702 param_count = u30()
703 _ = u30() # return type
e0df6211 704 for _ in range(param_count):
ba552f54
PH
705 _ = u30() # param type
706 _ = u30() # name index (always 0 for youtube)
707 flags = read_byte()
e0df6211
PH
708 if flags & 0x08 != 0:
709 # Options present
ba552f54 710 option_count = u30()
e0df6211 711 for c in range(option_count):
ba552f54
PH
712 _ = u30() # val
713 _ = read_bytes(1) # kind
e0df6211
PH
714 if flags & 0x80 != 0:
715 # Param names present
716 for _ in range(param_count):
ba552f54 717 _ = u30() # param name
e0df6211
PH
718 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
719 method_infos.append(mi)
720
721 # Metadata
ba552f54 722 metadata_count = u30()
e0df6211 723 for _c in range(metadata_count):
ba552f54
PH
724 _ = u30() # name
725 item_count = u30()
e0df6211 726 for _c2 in range(item_count):
ba552f54
PH
727 _ = u30() # key
728 _ = u30() # value
729
730 def parse_traits_info():
731 trait_name_idx = u30()
732 kind_full = read_byte()
e0df6211
PH
733 kind = kind_full & 0x0f
734 attrs = kind_full >> 4
735 methods = {}
736 if kind in [0x00, 0x06]: # Slot or Const
ba552f54
PH
737 _ = u30() # Slot id
738 type_name_idx = u30()
739 vindex = u30()
e0df6211 740 if vindex != 0:
ba552f54 741 _ = read_byte() # vkind
e0df6211 742 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
ba552f54
PH
743 _ = u30() # disp_id
744 method_idx = u30()
e0df6211
PH
745 methods[multinames[trait_name_idx]] = method_idx
746 elif kind == 0x04: # Class
ba552f54
PH
747 _ = u30() # slot_id
748 _ = u30() # classi
e0df6211 749 elif kind == 0x05: # Function
ba552f54
PH
750 _ = u30() # slot_id
751 function_idx = u30()
e0df6211
PH
752 methods[function_idx] = multinames[trait_name_idx]
753 else:
754 raise ExtractorError(u'Unsupported trait kind %d' % kind)
755
756 if attrs & 0x4 != 0: # Metadata present
ba552f54 757 metadata_count = u30()
e0df6211 758 for _c3 in range(metadata_count):
ba552f54 759 _ = u30()
e0df6211 760
ba552f54 761 return methods
e0df6211
PH
762
763 # Classes
764 TARGET_CLASSNAME = u'SignatureDecipher'
765 searched_idx = multinames.index(TARGET_CLASSNAME)
766 searched_class_id = None
ba552f54 767 class_count = u30()
e0df6211 768 for class_id in range(class_count):
ba552f54 769 name_idx = u30()
e0df6211
PH
770 if name_idx == searched_idx:
771 # We found the class we're looking for!
772 searched_class_id = class_id
ba552f54
PH
773 _ = u30() # super_name idx
774 flags = read_byte()
e0df6211 775 if flags & 0x08 != 0: # Protected namespace is present
ba552f54
PH
776 protected_ns_idx = u30()
777 intrf_count = u30()
e0df6211 778 for _c2 in range(intrf_count):
ba552f54
PH
779 _ = u30()
780 _ = u30() # iinit
781 trait_count = u30()
e0df6211 782 for _c2 in range(trait_count):
ba552f54 783 _ = parse_traits_info()
e0df6211
PH
784
785 if searched_class_id is None:
786 raise ExtractorError(u'Target class %r not found' %
787 TARGET_CLASSNAME)
788
789 method_names = {}
790 method_idxs = {}
791 for class_id in range(class_count):
ba552f54
PH
792 _ = u30() # cinit
793 trait_count = u30()
e0df6211 794 for _c2 in range(trait_count):
ba552f54 795 trait_methods = parse_traits_info()
e0df6211
PH
796 if class_id == searched_class_id:
797 method_names.update(trait_methods.items())
798 method_idxs.update(dict(
799 (idx, name)
800 for name, idx in trait_methods.items()))
801
802 # Scripts
ba552f54 803 script_count = u30()
e0df6211 804 for _c in range(script_count):
ba552f54
PH
805 _ = u30() # init
806 trait_count = u30()
e0df6211 807 for _c2 in range(trait_count):
ba552f54 808 _ = parse_traits_info()
e0df6211
PH
809
810 # Method bodies
ba552f54 811 method_body_count = u30()
e0df6211
PH
812 Method = collections.namedtuple('Method', ['code', 'local_count'])
813 methods = {}
814 for _c in range(method_body_count):
ba552f54
PH
815 method_idx = u30()
816 max_stack = u30()
817 local_count = u30()
818 init_scope_depth = u30()
819 max_scope_depth = u30()
820 code_length = u30()
821 code = read_bytes(code_length)
e0df6211 822 if method_idx in method_idxs:
ba552f54 823 m = Method(code, local_count)
e0df6211 824 methods[method_idxs[method_idx]] = m
ba552f54 825 exception_count = u30()
e0df6211 826 for _c2 in range(exception_count):
ba552f54
PH
827 _ = u30() # from
828 _ = u30() # to
829 _ = u30() # target
830 _ = u30() # exc_type
831 _ = u30() # var_name
832 trait_count = u30()
e0df6211 833 for _c2 in range(trait_count):
ba552f54 834 _ = parse_traits_info()
e0df6211 835
ba552f54 836 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
837 assert len(methods) == len(method_idxs)
838
839 method_pyfunctions = {}
840
841 def extract_function(func_name):
842 if func_name in method_pyfunctions:
843 return method_pyfunctions[func_name]
844 if func_name not in methods:
845 raise ExtractorError(u'Cannot find function %r' % func_name)
846 m = methods[func_name]
847
848 def resfunc(args):
e0df6211
PH
849 registers = ['(this)'] + list(args) + [None] * m.local_count
850 stack = []
851 coder = io.BytesIO(m.code)
852 while True:
853 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 854 if opcode == 36: # pushbyte
e0df6211
PH
855 v = struct.unpack('!B', coder.read(1))[0]
856 stack.append(v)
857 elif opcode == 44: # pushstring
858 idx = u30(coder)
859 stack.append(constant_strings[idx])
860 elif opcode == 48: # pushscope
861 # We don't implement the scope register, so we'll just
862 # ignore the popped value
863 stack.pop()
864 elif opcode == 70: # callproperty
865 index = u30(coder)
866 mname = multinames[index]
867 arg_count = u30(coder)
868 args = list(reversed(
869 [stack.pop() for _ in range(arg_count)]))
870 obj = stack.pop()
871 if mname == u'split':
872 assert len(args) == 1
873 assert isinstance(args[0], compat_str)
874 assert isinstance(obj, compat_str)
875 if args[0] == u'':
876 res = list(obj)
877 else:
878 res = obj.split(args[0])
879 stack.append(res)
a7177865
PH
880 elif mname == u'slice':
881 assert len(args) == 1
882 assert isinstance(args[0], int)
883 assert isinstance(obj, list)
884 res = obj[args[0]:]
885 stack.append(res)
886 elif mname == u'join':
887 assert len(args) == 1
888 assert isinstance(args[0], compat_str)
889 assert isinstance(obj, list)
890 res = args[0].join(obj)
891 stack.append(res)
e0df6211
PH
892 elif mname in method_pyfunctions:
893 stack.append(method_pyfunctions[mname](args))
894 else:
895 raise NotImplementedError(
896 u'Unsupported property %r on %r'
897 % (mname, obj))
a7177865
PH
898 elif opcode == 72: # returnvalue
899 res = stack.pop()
900 return res
901 elif opcode == 79: # callpropvoid
902 index = u30(coder)
903 mname = multinames[index]
904 arg_count = u30(coder)
905 args = list(reversed(
906 [stack.pop() for _ in range(arg_count)]))
907 obj = stack.pop()
908 if mname == u'reverse':
909 assert isinstance(obj, list)
910 obj.reverse()
911 else:
912 raise NotImplementedError(
913 u'Unsupported (void) property %r on %r'
914 % (mname, obj))
e0df6211
PH
915 elif opcode == 93: # findpropstrict
916 index = u30(coder)
917 mname = multinames[index]
918 res = extract_function(mname)
919 stack.append(res)
920 elif opcode == 97: # setproperty
921 index = u30(coder)
922 value = stack.pop()
923 idx = stack.pop()
924 obj = stack.pop()
925 assert isinstance(obj, list)
926 assert isinstance(idx, int)
927 obj[idx] = value
928 elif opcode == 98: # getlocal
929 index = u30(coder)
930 stack.append(registers[index])
931 elif opcode == 99: # setlocal
932 index = u30(coder)
933 value = stack.pop()
934 registers[index] = value
935 elif opcode == 102: # getproperty
936 index = u30(coder)
937 pname = multinames[index]
938 if pname == u'length':
939 obj = stack.pop()
940 assert isinstance(obj, list)
941 stack.append(len(obj))
942 else: # Assume attribute access
943 idx = stack.pop()
944 assert isinstance(idx, int)
945 obj = stack.pop()
946 assert isinstance(obj, list)
947 stack.append(obj[idx])
948 elif opcode == 128: # coerce
949 _ = u30(coder)
950 elif opcode == 133: # coerce_s
951 assert isinstance(stack[-1], (type(None), compat_str))
952 elif opcode == 164: # modulo
953 value2 = stack.pop()
954 value1 = stack.pop()
955 res = value1 % value2
956 stack.append(res)
a7177865
PH
957 elif opcode == 208: # getlocal_0
958 stack.append(registers[0])
959 elif opcode == 209: # getlocal_1
960 stack.append(registers[1])
961 elif opcode == 210: # getlocal_2
962 stack.append(registers[2])
963 elif opcode == 211: # getlocal_3
964 stack.append(registers[3])
e0df6211
PH
965 elif opcode == 214: # setlocal_2
966 registers[2] = stack.pop()
967 elif opcode == 215: # setlocal_3
968 registers[3] = stack.pop()
969 else:
970 raise NotImplementedError(
971 u'Unsupported opcode %d' % opcode)
972
973 method_pyfunctions[func_name] = resfunc
974 return resfunc
975
976 initial_function = extract_function(u'decipher')
977 return lambda s: initial_function([s])
978
83799698 979 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 980 """Turn the encrypted s field into a working signature"""
6b37f0be 981
83799698 982 if player_url is not None:
e0df6211 983 try:
83799698
PH
984 if player_url not in self._player_cache:
985 func = self._extract_signature_function(
986 video_id, player_url
e0df6211 987 )
83799698
PH
988 self._player_cache[player_url] = func
989 return self._player_cache[player_url](s)
e0df6211
PH
990 except Exception as e:
991 tb = traceback.format_exc()
83799698
PH
992 self._downloader.report_warning(
993 u'Automatic signature extraction failed: ' + tb)
e0df6211 994
83799698
PH
995 self._downloader.report_warning(
996 u'Warning: Falling back to static signature algorithm')
2f2ffea9
PH
997 return self._static_decrypt_signature(
998 s, video_id, player_url, age_gate)
e0df6211 999
2f2ffea9 1000 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
1001 if age_gate:
1002 # The videos with age protection use another player, so the
1003 # algorithms can be different.
1004 if len(s) == 86:
1005 return s[2:63] + s[82] + s[64:82] + s[63]
1006
1007 if len(s) == 92:
444b1165
JMF
1008 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1009 elif len(s) == 90:
1010 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 1011 elif len(s) == 89:
1012 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 1013 elif len(s) == 88:
3e223834 1014 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 1015 elif len(s) == 87:
3a725669 1016 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 1017 elif len(s) == 86:
1cf911bc 1018 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
be547e1d 1019 elif len(s) == 85:
6ae8ee3f 1020 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1021 elif len(s) == 84:
23b00bc0 1022 return s[81:36:-1] + s[0] + s[35:2:-1]
be547e1d 1023 elif len(s) == 83:
e1842025 1024 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
be547e1d 1025 elif len(s) == 82:
ce85f022 1026 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
be547e1d 1027 elif len(s) == 81:
aedd6bb9 1028 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1029 elif len(s) == 80:
1030 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1031 elif len(s) == 79:
1032 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1033
1034 else:
1035 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1036
75952c6e
JMF
1037 def _decrypt_signature_age_gate(self, s):
1038 # The videos with age protection use another player, so the algorithms
1039 # can be different.
1040 if len(s) == 86:
1041 return s[2:63] + s[82] + s[64:82] + s[63]
1042 else:
1043 # Fallback to the other algortihms
b072a9de 1044 return self._decrypt_signature(s)
c5e8d7af 1045
def _get_available_subtitles(self, video_id):
    """Return a dict mapping language code -> subtitles URL for video_id.

    The track list is downloaded from the timedtext service.  If the
    download fails, or no track is listed, a warning is reported and an
    empty dict is returned.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitle_urls = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)
    # Only the language code of each (name, lang_code) pair is used.
    for _track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat'),
        })
        subtitle_urls[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if not subtitle_urls:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
        return {}
    return subtitle_urls
1070
055e6f36 1071 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1072 """We need the webpage for getting the captions url, pass it as an
1073 argument to speed up the process."""
de7f3446
JMF
1074 sub_format = self._downloader.params.get('subtitlesformat')
1075 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1076 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1077 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1078 if mobj is None:
1079 self._downloader.report_warning(err_msg)
1080 return {}
1081 player_config = json.loads(mobj.group(1))
1082 try:
1083 args = player_config[u'args']
1084 caption_url = args[u'ttsurl']
1085 timestamp = args[u'timestamp']
055e6f36
JMF
1086 # We get the available subtitles
1087 list_params = compat_urllib_parse.urlencode({
1088 'type': 'list',
1089 'tlangs': 1,
1090 'asrs': 1,
de7f3446 1091 })
055e6f36
JMF
1092 list_url = caption_url + '&' + list_params
1093 list_page = self._download_webpage(list_url, video_id)
1094 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
e3dc22ca
JMF
1095 original_lang_node = caption_list.find('track')
1096 if original_lang_node.attrib.get('kind') != 'asr' :
1097 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1098 return {}
1099 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1100
1101 sub_lang_list = {}
1102 for lang_node in caption_list.findall('target'):
1103 sub_lang = lang_node.attrib['lang_code']
1104 params = compat_urllib_parse.urlencode({
1105 'lang': original_lang,
1106 'tlang': sub_lang,
1107 'fmt': sub_format,
1108 'ts': timestamp,
1109 'kind': 'asr',
1110 })
1111 sub_lang_list[sub_lang] = caption_url + '&' + params
1112 return sub_lang_list
de7f3446
JMF
1113 # An extractor error can be raise by the download process if there are
1114 # no automatic captions but there are subtitles
1115 except (KeyError, ExtractorError):
1116 self._downloader.report_warning(err_msg)
1117 return {}
1118
c5e8d7af
PH
1119 def _print_formats(self, formats):
1120 print('Available formats:')
1121 for x in formats:
03cc7c20
JMF
1122 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1123 self._video_dimensions.get(x, '???'),
836a086c 1124 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
c5e8d7af
PH
1125
1126 def _extract_id(self, url):
1127 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1128 if mobj is None:
1129 raise ExtractorError(u'Invalid URL: %s' % url)
1130 video_id = mobj.group(2)
1131 return video_id
1132
1d043b93
JMF
1133 def _get_video_url_list(self, url_map):
1134 """
1135 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1136 with the requested formats.
1137 """
1138 req_format = self._downloader.params.get('format', None)
1139 format_limit = self._downloader.params.get('format_limit', None)
1140 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1141 if format_limit is not None and format_limit in available_formats:
1142 format_list = available_formats[available_formats.index(format_limit):]
1143 else:
1144 format_list = available_formats
1145 existing_formats = [x for x in format_list if x in url_map]
1146 if len(existing_formats) == 0:
1147 raise ExtractorError(u'no known formats available for video')
1148 if self._downloader.params.get('listformats', None):
1149 self._print_formats(existing_formats)
1150 return
1151 if req_format is None or req_format == 'best':
1152 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1153 elif req_format == 'worst':
1154 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1155 elif req_format in ('-1', 'all'):
1156 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1157 else:
1158 # Specific formats. We pick the first in a slash-delimeted sequence.
bdc6b3fc
AZ
1159 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1160 # available in the specified format. For example,
1161 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1162 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1163 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1d043b93
JMF
1164 req_formats = req_format.split('/')
1165 video_url_list = None
1166 for rf in req_formats:
1167 if rf in url_map:
1168 video_url_list = [(rf, url_map[rf])]
1169 break
bdc6b3fc
AZ
1170 if rf in self._video_formats_map:
1171 for srf in self._video_formats_map[rf]:
1172 if srf in url_map:
1173 video_url_list = [(srf, url_map[srf])]
1174 break
1175 else:
1176 continue
1177 break
1d043b93
JMF
1178 if video_url_list is None:
1179 raise ExtractorError(u'requested format not available')
1180 return video_url_list
1181
1182 def _extract_from_m3u8(self, manifest_url, video_id):
1183 url_map = {}
1184 def _get_urls(_manifest):
1185 lines = _manifest.split('\n')
1186 urls = filter(lambda l: l and not l.startswith('#'),
1187 lines)
1188 return urls
1189 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1190 formats_urls = _get_urls(manifest)
1191 for format_url in formats_urls:
890f62e8 1192 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1193 url_map[itag] = format_url
1194 return url_map
1195
def _real_extract(self, url):
    """Extract the metadata and the media URL(s) of a single video.

    Steps, in order: resolve the video id from url, download the watch
    page, download the get_video_info data (through the embedded-player
    endpoint when the watch page shows the age gate), collect uploader,
    title, thumbnail, upload date, description, subtitles and duration,
    build the {itag: url} map (deciphering encrypted signatures when
    needed) and finally select the requested formats.

    Returns a list with one info dict per selected format, or None when
    only a listing was requested ('listsubtitles', or a falsy result from
    _get_video_url_list).  Raises ExtractorError on download failures or
    when mandatory fields are missing.
    """
    # A URL ending right after "watch?feature=..." usually means the shell
    # ate everything after an unquoted '&'.
    if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
        self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    # Undecodable bytes are dropped rather than aborting the extraction.
    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Remove the backslash escapes (e.g. "\/" -> "/").
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each 'el' variant in turn and keep the first answer that
        # contains a token (the last answer is kept if none does).
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else: # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace before
        # handing the text to unified_strdate.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        # Fall back to the <meta name="description"> tag.
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    # Listing the subtitles ends the extraction: nothing is returned.
    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download

    # Merge the stream maps embedded in the watch page (ytplayer.config)
    # into video_info.  The ValueError raised below only serves to skip
    # this whole step when the config is not present in the page.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        # Same check for the adaptive formats; whichever source is used,
        # its entries are appended to the comma-separated stream map.
        m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
            else:
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
    # NOTE(review): only ValueError is handled; a KeyError from a config
    # without 'args' or 'url_encoded_fmt_stream_map' would propagate --
    # confirm that this is intended.
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        # Each comma-separated entry is a query string describing one format.
        # Note that 'url' is reused here as a scratch variable.
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature: usable as-is.
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: must be deciphered first.
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            # NOTE(review): when player_url is None, None is
                            # passed as the text to search -- confirm that
                            # _search_regex tolerates that.
                            player_version = self._search_regex(
                                r'-(.+)\.swf$',
                                player_url if player_url else None,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    # Without the age gate the JS player referenced by the
                    # watch page replaces the SWF player URL found earlier.
                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
        # A falsy result (e.g. after listing the formats) ends the extraction.
        if not video_url_list:
            return
    elif video_info.get('hlsvp'):
        # The formats are listed in an m3u8 manifest.
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return

    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    # Build one result dict per selected format.
    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        # "<itag or extension> - <dimensions>[ (<special itag note>)]"
        video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'),
                                          ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

        results.append({
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
        })
    return results
1439
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for playlists, read page by page from the gdata playlists feed."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for url."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is captured by the first group for URLs and by the second
        # one for bare playlist ids.
        playlist_id = match.group(1) or match.group(2)

        # (position, watch url) pairs collected from every feed page.
        indexed_urls = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            page_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(page_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                # Entries without a video id are left out.
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    watch_url = 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    indexed_urls.append((position, watch_url))

        # Restore the playlist order before dropping the positions.
        ordered_urls = [pair[1] for pair in sorted(indexed_urls)]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1507
1508
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a channel page."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order of first
        appearance and without duplicates."""
        found = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Return a one-element list holding the playlist result for the channel."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # The first page is the regular HTML channel page.
        pagenum = 1
        page_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(page_url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = list(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                page_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(page_url, channel_id,
                                                  u'Downloading page #%s' % pagenum)
                ajax_page = json.loads(raw_page)

                video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))

                # Stop once the "load more" widget is no longer offered.
                if self._MORE_PAGES_INDICATOR not in ajax_page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1563
1564
class YoutubeUserIE(InfoExtractor):
    """Extractor for all the uploads of a user, read from the gdata uploads feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other ...IE class of this module accepts url."""
        # Don't claim a url that another youtube extractor can handle: the
        # regex of this one is too permissive and would match it as well.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of the user's uploads."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # The previous page was full and also the last one.
                break

            # The video id is the last path component of each entry id.
            ids_in_page = [
                entry['id']['$t'].split('/')[-1]
                for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - a page that is not "full" has to be
            # the last one, so there is no need to query again.
            if len(ids_in_page) < page_size:
                break

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
b05654f0
PH
1629
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the gdata videos feed (50 results per page)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        collected_ids = []
        # Starts as the requested amount and is lowered to the total the
        # API reports once the first page has been read.
        limit = n
        pagenum = 0

        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids += [video['id'] for video in api_response['items']]

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have brought more ids than requested.
        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]
        videos = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in collected_ids]
        return self.playlist_result(videos, query)
75dff0ee
JMF
1671
1672
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; yields one playlist reference per season link."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return a list of url results, one per playlist linked from the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        season_results = []
        for path in season_paths:
            season_results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1686
1687
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Increment of the 'paging' parameter between two consecutive requests
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the paging offset is still to be filled in."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with every video found in the feed."""
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for page_index in itertools.count(0):
            paging = page_index * self._PAGING_STEP
            raw_info = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_index)
            info = json.loads(raw_info)
            # Video ids are scraped from the watch links of the returned HTML.
            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', info['feed_html']))
            for video_id in video_ids:
                feed_entries.append(self.url_result(video_id, 'Youtube'))
            # A null 'paging' value marks the last page.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1729
# Feed extractor for the subscriptions feed; all the logic lives in
# YoutubeFeedsInfoExtractor, this class only supplies the constants.
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    # Either the feed page URL or the ":ytsubs" / ":ytsubscriptions" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Used by the base class for the feed_name URL parameter and for IE_NAME
    _FEED_NAME = 'subscriptions'
    # Title given to the resulting playlist
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
1735
# Feed extractor for the recommended videos feed; all the logic lives in
# YoutubeFeedsInfoExtractor, this class only supplies the constants.
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Either the feed page URL or the ":ytrec" / ":ytrecommended" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Used by the base class for the feed_name URL parameter and for IE_NAME
    _FEED_NAME = 'recommended'
    # Title given to the resulting playlist
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
c626a3d9 1741
43ba5456
JMF
# Feed extractor for the watch later list; all the logic lives in
# YoutubeFeedsInfoExtractor, this class only supplies the constants.
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Either the feed page URL or the ":ytwatchlater" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # Used by the base class for the feed_name URL parameter and for IE_NAME
    _FEED_NAME = 'watch_later'
    # Title given to the resulting playlist
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Overrides the base class paging increment of 30
    _PAGING_STEP = 100
    # Makes the base class request action_load_personal_feed
    _PERSONAL_FEED = True
c626a3d9
JMF
1749
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its underlying playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist.

        The url argument is not used: the favourites page is always
        downloaded from its fixed address.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from the first "list=" parameter in the page.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')