]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Add support for yourepeat.com URLs (Closes #2397)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import string
11import struct
12import traceback
13import zlib
c5e8d7af 14
b05654f0 15from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 16from .subtitles import SubtitlesInfoExtractor
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c91778f8 32 RegexNotFoundError,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
04cc9617 35 orderedSet,
edf3e38e 36 write_json_file,
81c2f20b 37 uppercase_escape,
c5e8d7af
PH
38)
39
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and report whether a page was obtained.

        The download is non-fatal; the result is coerced to bool, so a
        failed (or empty) download yields False.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Try to log in with the configured credentials.

        Returns True when the login POST succeeded and False in every other
        case (no credentials, login page or login request could not be
        downloaded, or the login form was served again).

        Raises ExtractorError if no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Be explicit: every other failure path of this method returns
            # False as well (previously this one returned None).
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Getting the login form back is treated as rejected credentials.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True.

        The download is issued without fatal=False, unlike the other
        requests in this class.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that reports failure. Does nothing without a downloader."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 131
8377574c 132
de7f3446 133class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Human-readable description of this extractor.
IE_DESC = u'YouTube.com'
# Verbose-mode (?x) pattern for everything this extractor accepts.
# Group 1 is the optional URL prefix (scheme, one of the listed hostnames or
# youtu.be, and whatever precedes the ID); group 2 is the 11-character video
# ID.  The conditional (?(1).+)? only allows trailing text when a prefix was
# matched, so a bare ID must stand alone.
_VALID_URL = r"""(?x)^
                 (
                     (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        (?:www\.)?deturl\.com/www\.youtube\.com/|
                        (?:www\.)?pwnyoutube\.com/|
                        (?:www\.)?yourepeat\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     ))
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )
                 )?                                                       # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                 (?(1).+)?                                                # if we found the ID, everything can follow
                 $"""
# Captures the value of a next_url query parameter (everything up to the next '&').
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
# Static metadata for known format codes, keyed by the code as a string.
# Each value is a dict of whatever is known for that code: 'ext', optional
# 'width'/'height'/'resolution', 'format_note', codec and bitrate hints, and
# a 'preference' value.
# NOTE(review): negative 'preference' values presumably rank these formats
# below the plain ones during format selection - confirm against the sorter.
_formats = {
    '5': {'ext': 'flv', 'width': 400, 'height': 240},
    '6': {'ext': 'flv', 'width': 450, 'height': 270},
    '13': {'ext': '3gp'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
    '34': {'ext': 'flv', 'width': 640, 'height': 360},
    '35': {'ext': 'flv', 'width': 854, 'height': 480},
    '36': {'ext': '3gp', 'width': 320, 'height': 240},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
    '43': {'ext': 'webm', 'width': 640, 'height': 360},
    '44': {'ext': 'webm', 'width': 854, 'height': 480},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


    # 3d videos
    '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
    '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
    '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
    '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
    '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
    '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
    '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},

    # Apple HTTP Live Streaming
    '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
    '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
    '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
    '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
    '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
    '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
    '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
    '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
    '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
    '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
    '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
    '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
    '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
    '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},

    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

    # Dash webm
    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
    '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
    '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
    '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
    '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
    '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
    '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
    '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},

    # Dash webm audio
    '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
    '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},
}
836a086c 234
# Short identifier of this extractor.
IE_NAME = u'youtube'
# Test definitions: each entry gives a URL, the expected output file name and
# the expected metadata ("info_dict"); optional "note" describes the case and
# optional "params" supplies options for that test.
# NOTE(review): "description" values starting with "md5:" are presumably
# compared by hash by the test harness - confirm there.
_TESTS = [
    {
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"info_dict": {
            u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
            u"uploader": u"Philipp Hagemeister",
            u"uploader_id": u"phihag",
            u"upload_date": u"20121002",
            u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        }
    },
    {
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"info_dict": {
            u"upload_date": u"20120506",
            u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
            u"description": u"md5:5b292926389560516e384ac437c0ec07",
            u"uploader": u"Icona Pop",
            u"uploader_id": u"IconaPop"
        }
    },
    {
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"info_dict": {
            u"upload_date": u"20130703",
            u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
            u"description": u"md5:64249768eec3bc4276236606ea996373",
            u"uploader": u"justintimberlakeVEVO",
            u"uploader_id": u"justintimberlakeVEVO"
        }
    },
    {
        # Protocol-relative URL with a mixed-case hostname.
        u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
        u"file": u"yZIXLfi8CZQ.mp4",
        u"note": u"Embed-only video (#1746)",
        u"info_dict": {
            u"upload_date": u"20120608",
            u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
            u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
            u"uploader": u"SET India",
            u"uploader_id": u"setindia"
        }
    },
    {
        u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
        u"file": u"a9LDPn-MO4I.m4a",
        u"note": u"256k DASH audio (format 141) via DASH manifest",
        u"info_dict": {
            u"upload_date": "20121002",
            u"uploader_id": "8KVIDEO",
            u"description": "No description available.",
            u"uploader": "8KVIDEO",
            u"title": "UHDTV TEST 8K VIDEO.mp4"
        },
        u"params": {
            u"youtube_include_dash_manifest": True,
            u"format": "141",
        },
    },
]
301
c5e8d7af
PH
302
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    A URL accepted by YoutubePlaylistIE is always rejected here; otherwise
    the answer is whether the URL matches _VALID_URL.
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    if claimed_by_playlist:
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 308
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initialiser, then create an
    empty per-instance ``_player_cache`` dict."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # NOTE(review): presumably maps a player to its extracted signature
    # function; the users of this dict are not visible here - confirm.
    self._player_cache = {}
e0df6211 312
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage of *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
316
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that information about *video_id* is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
320
def report_unavailable_format(self, video_id, format):
    """Report that the given format is not available for the video."""
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
324
def report_rtmp_download(self):
    """Print a note that an RTMP download was detected."""
    message = u'RTMP download detected'
    self.to_screen(message)
328
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a callable that transforms a signature string of length *slen*.

    The player type (file extension) and player id are parsed out of
    *player_url*.  If a cache directory is configured, a previously stored
    index list for this player/length is used when it can be read;
    otherwise the player is downloaded, parsed with _parse_sig_js or
    _parse_sig_swf, and the resulting permutation is written to the cache
    (a failure to write only produces a warning).

    Raises ExtractorError if the player URL cannot be parsed or the player
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        # Previously this crashed with AttributeError on None.group().
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Internal invariant: the character classes above cannot produce a
    # path separator, so func_id is always a plain file name.
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except (IOError, ValueError):
            # No cache available, or the cache file is not valid JSON /
            # UTF-8; fall through and rebuild it from the player.
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An assert here would vanish under -O and leave `res` unbound.
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Feed the function a string whose i-th character has code
            # point i; the code points of the output are then the source
            # indices, which is what gets cached.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best-effort: warn and still return the function.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
385
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python source equivalent to *func* for inputs of length *slen*.

    *func* is applied to a probe string whose i-th character has code
    point i, so the code points of the result are the source indices.
    Runs of indices that go up or down by one are rendered as slices of
    ``s``; all other indices are rendered as single subscripts.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # With fewer than two indices the pairwise loop below never runs
        # and its loop variable would be unbound afterwards.
        if not idxs:
            return
        if len(idxs) == 1:
            yield u's[%d]' % idxs[0]
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; i starts over on the next round.
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        # Flush whatever is pending for the final index.
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 421
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a Python callable from the signature function in *jscode*.

    The name of the entry function is taken from the first
    ``signature=<name>`` occurrence.  Function bodies are located by regex
    and interpreted statement by statement; only the small JavaScript
    subset handled below is supported (assignment, indexed assignment,
    ``return``, ``split("")``, ``join("")``, ``length``, ``reverse()``,
    ``slice(n)``, indexing, ``%`` and calls to other such functions).

    Returns a function taking the signature string and returning the
    result of the interpreted entry function.

    Raises ExtractorError for unsupported statements/expressions or when a
    referenced function definition cannot be found in *jscode*.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    # Cache of already extracted helper functions, keyed by JS name.
    functions = {}

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')

        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m:
            expr = ass_m.group('expr')
            if ass_m.groupdict().get('index'):
                # name[index] = expr
                def assign(val):
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    lvar[idx] = val
                    return val
            else:
                # name = expr
                def assign(val):
                    local_vars[ass_m.group('out')] = val
                    return val
        elif stmt.startswith(u'return '):
            assign = lambda v: v
            expr = stmt[len(u'return '):]
        else:
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)

        v = interpret_expression(expr, local_vars, allow_recursion)
        return assign(v)

    def interpret_expression(expr, local_vars, allow_recursion):
        if expr.isdigit():
            return int(expr)

        if expr.isalpha():
            return local_vars[expr]

        # name.member
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        if m:
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
                return list(val)
            if member == 'join("")':
                return u''.join(val)
            if member == 'length':
                return len(val)
            if member == 'reverse()':
                return val[::-1]
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
            if slice_m:
                idx = interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion-1)
                return val[idx:]
            # Unknown members fall through to the patterns below.

        # name[index]
        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        if m:
            val = local_vars[m.group('in')]
            idx = interpret_expression(m.group('idx'), local_vars,
                                       allow_recursion-1)
            return val[idx]

        # a % b
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        if m:
            a = interpret_expression(m.group('a'),
                                     local_vars, allow_recursion)
            b = interpret_expression(m.group('b'),
                                     local_vars, allow_recursion)
            return a % b

        # func(arg, ...) with plain names or integer literals as arguments
        m = re.match(
            r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        if m:
            fname = m.group('func')
            if fname not in functions:
                functions[fname] = extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        func_m = re.search(
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            jscode)
        if func_m is None:
            # Previously this crashed with AttributeError on None.group().
            raise ExtractorError(
                u'Could not find JS function %r' % funcname)
        argnames = func_m.group('args').split(',')

        def resf(args):
            local_vars = dict(zip(argnames, args))
            # The value of the last statement is the function result.
            for stmt in func_m.group('code').split(';'):
                res = interpret_statement(stmt, local_vars)
            return res
        return resf

    initial_function = extract_function(funcname)
    return lambda s: initial_function([s])
533
534 def _parse_sig_swf(self, file_contents):
535 if file_contents[1:3] != b'WS':
536 raise ExtractorError(
537 u'Not an SWF file; header is %r' % file_contents[:3])
538 if file_contents[:1] == b'C':
539 content = zlib.decompress(file_contents[8:])
540 else:
541 raise NotImplementedError(u'Unsupported compression format %r' %
542 file_contents[:1])
543
544 def extract_tags(content):
545 pos = 0
546 while pos < len(content):
547 header16 = struct.unpack('<H', content[pos:pos+2])[0]
548 pos += 2
549 tag_code = header16 >> 6
550 tag_len = header16 & 0x3f
551 if tag_len == 0x3f:
552 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
553 pos += 4
554 assert pos+tag_len <= len(content)
555 yield (tag_code, content[pos:pos+tag_len])
556 pos += tag_len
557
558 code_tag = next(tag
559 for tag_code, tag in extract_tags(content)
560 if tag_code == 82)
561 p = code_tag.index(b'\0', 4) + 1
ba552f54 562 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
563
564 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
565 def read_int(reader=None):
566 if reader is None:
567 reader = code_reader
e0df6211
PH
568 res = 0
569 shift = 0
570 for _ in range(5):
ba552f54
PH
571 buf = reader.read(1)
572 assert len(buf) == 1
573 b = struct.unpack('<B', buf)[0]
e0df6211
PH
574 res = res | ((b & 0x7f) << shift)
575 if b & 0x80 == 0:
576 break
577 shift += 7
ba552f54
PH
578 return res
579
580 def u30(reader=None):
581 res = read_int(reader)
582 assert res & 0xf0000000 == 0
e0df6211
PH
583 return res
584 u32 = read_int
585
ba552f54
PH
586 def s32(reader=None):
587 v = read_int(reader)
e0df6211
PH
588 if v & 0x80000000 != 0:
589 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
590 return v
591
0ca96d48 592 def read_string(reader=None):
ba552f54
PH
593 if reader is None:
594 reader = code_reader
595 slen = u30(reader)
596 resb = reader.read(slen)
597 assert len(resb) == slen
598 return resb.decode('utf-8')
599
600 def read_bytes(count, reader=None):
601 if reader is None:
602 reader = code_reader
603 resb = reader.read(count)
604 assert len(resb) == count
605 return resb
606
607 def read_byte(reader=None):
608 resb = read_bytes(1, reader=reader)
609 res = struct.unpack('<B', resb)[0]
610 return res
e0df6211
PH
611
612 # minor_version + major_version
0ca96d48 613 read_bytes(2 + 2)
e0df6211
PH
614
615 # Constant pool
ba552f54 616 int_count = u30()
e0df6211 617 for _c in range(1, int_count):
0ca96d48 618 s32()
ba552f54 619 uint_count = u30()
e0df6211 620 for _c in range(1, uint_count):
0ca96d48 621 u32()
ba552f54 622 double_count = u30()
0ca96d48 623 read_bytes((double_count-1) * 8)
ba552f54 624 string_count = u30()
e0df6211
PH
625 constant_strings = [u'']
626 for _c in range(1, string_count):
0ca96d48 627 s = read_string()
e0df6211 628 constant_strings.append(s)
ba552f54 629 namespace_count = u30()
e0df6211 630 for _c in range(1, namespace_count):
0ca96d48
PH
631 read_bytes(1) # kind
632 u30() # name
ba552f54 633 ns_set_count = u30()
e0df6211 634 for _c in range(1, ns_set_count):
ba552f54 635 count = u30()
e0df6211 636 for _c2 in range(count):
0ca96d48 637 u30()
ba552f54 638 multiname_count = u30()
e0df6211
PH
639 MULTINAME_SIZES = {
640 0x07: 2, # QName
641 0x0d: 2, # QNameA
642 0x0f: 1, # RTQName
643 0x10: 1, # RTQNameA
644 0x11: 0, # RTQNameL
645 0x12: 0, # RTQNameLA
646 0x09: 2, # Multiname
647 0x0e: 2, # MultinameA
648 0x1b: 1, # MultinameL
649 0x1c: 1, # MultinameLA
650 }
651 multinames = [u'']
652 for _c in range(1, multiname_count):
ba552f54 653 kind = u30()
e0df6211
PH
654 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
655 if kind == 0x07:
0ca96d48 656 u30() # namespace_idx
ba552f54 657 name_idx = u30()
e0df6211
PH
658 multinames.append(constant_strings[name_idx])
659 else:
660 multinames.append('[MULTINAME kind: %d]' % kind)
661 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 662 u30()
e0df6211
PH
663
664 # Methods
ba552f54 665 method_count = u30()
e0df6211
PH
666 MethodInfo = collections.namedtuple(
667 'MethodInfo',
668 ['NEED_ARGUMENTS', 'NEED_REST'])
669 method_infos = []
670 for method_id in range(method_count):
ba552f54 671 param_count = u30()
0ca96d48 672 u30() # return type
e0df6211 673 for _ in range(param_count):
0ca96d48
PH
674 u30() # param type
675 u30() # name index (always 0 for youtube)
ba552f54 676 flags = read_byte()
e0df6211
PH
677 if flags & 0x08 != 0:
678 # Options present
ba552f54 679 option_count = u30()
e0df6211 680 for c in range(option_count):
0ca96d48
PH
681 u30() # val
682 read_bytes(1) # kind
e0df6211
PH
683 if flags & 0x80 != 0:
684 # Param names present
685 for _ in range(param_count):
0ca96d48 686 u30() # param name
e0df6211
PH
687 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
688 method_infos.append(mi)
689
690 # Metadata
ba552f54 691 metadata_count = u30()
e0df6211 692 for _c in range(metadata_count):
0ca96d48 693 u30() # name
ba552f54 694 item_count = u30()
e0df6211 695 for _c2 in range(item_count):
0ca96d48
PH
696 u30() # key
697 u30() # value
ba552f54
PH
698
699 def parse_traits_info():
700 trait_name_idx = u30()
701 kind_full = read_byte()
e0df6211
PH
702 kind = kind_full & 0x0f
703 attrs = kind_full >> 4
704 methods = {}
705 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
706 u30() # Slot id
707 u30() # type_name_idx
ba552f54 708 vindex = u30()
e0df6211 709 if vindex != 0:
0ca96d48 710 read_byte() # vkind
e0df6211 711 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 712 u30() # disp_id
ba552f54 713 method_idx = u30()
e0df6211
PH
714 methods[multinames[trait_name_idx]] = method_idx
715 elif kind == 0x04: # Class
0ca96d48
PH
716 u30() # slot_id
717 u30() # classi
e0df6211 718 elif kind == 0x05: # Function
0ca96d48 719 u30() # slot_id
ba552f54 720 function_idx = u30()
e0df6211
PH
721 methods[function_idx] = multinames[trait_name_idx]
722 else:
723 raise ExtractorError(u'Unsupported trait kind %d' % kind)
724
725 if attrs & 0x4 != 0: # Metadata present
ba552f54 726 metadata_count = u30()
e0df6211 727 for _c3 in range(metadata_count):
0ca96d48 728 u30() # metadata index
e0df6211 729
ba552f54 730 return methods
e0df6211
PH
731
732 # Classes
733 TARGET_CLASSNAME = u'SignatureDecipher'
734 searched_idx = multinames.index(TARGET_CLASSNAME)
735 searched_class_id = None
ba552f54 736 class_count = u30()
e0df6211 737 for class_id in range(class_count):
ba552f54 738 name_idx = u30()
e0df6211
PH
739 if name_idx == searched_idx:
740 # We found the class we're looking for!
741 searched_class_id = class_id
0ca96d48 742 u30() # super_name idx
ba552f54 743 flags = read_byte()
e0df6211 744 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 745 u30() # protected_ns_idx
ba552f54 746 intrf_count = u30()
e0df6211 747 for _c2 in range(intrf_count):
0ca96d48
PH
748 u30()
749 u30() # iinit
ba552f54 750 trait_count = u30()
e0df6211 751 for _c2 in range(trait_count):
0ca96d48 752 parse_traits_info()
e0df6211
PH
753
754 if searched_class_id is None:
755 raise ExtractorError(u'Target class %r not found' %
756 TARGET_CLASSNAME)
757
758 method_names = {}
759 method_idxs = {}
760 for class_id in range(class_count):
0ca96d48 761 u30() # cinit
ba552f54 762 trait_count = u30()
e0df6211 763 for _c2 in range(trait_count):
ba552f54 764 trait_methods = parse_traits_info()
e0df6211
PH
765 if class_id == searched_class_id:
766 method_names.update(trait_methods.items())
767 method_idxs.update(dict(
768 (idx, name)
769 for name, idx in trait_methods.items()))
770
771 # Scripts
ba552f54 772 script_count = u30()
e0df6211 773 for _c in range(script_count):
0ca96d48 774 u30() # init
ba552f54 775 trait_count = u30()
e0df6211 776 for _c2 in range(trait_count):
0ca96d48 777 parse_traits_info()
e0df6211
PH
778
779 # Method bodies
ba552f54 780 method_body_count = u30()
e0df6211
PH
781 Method = collections.namedtuple('Method', ['code', 'local_count'])
782 methods = {}
783 for _c in range(method_body_count):
ba552f54 784 method_idx = u30()
0ca96d48 785 u30() # max_stack
ba552f54 786 local_count = u30()
0ca96d48
PH
787 u30() # init_scope_depth
788 u30() # max_scope_depth
ba552f54
PH
789 code_length = u30()
790 code = read_bytes(code_length)
e0df6211 791 if method_idx in method_idxs:
ba552f54 792 m = Method(code, local_count)
e0df6211 793 methods[method_idxs[method_idx]] = m
ba552f54 794 exception_count = u30()
e0df6211 795 for _c2 in range(exception_count):
0ca96d48
PH
796 u30() # from
797 u30() # to
798 u30() # target
799 u30() # exc_type
800 u30() # var_name
ba552f54 801 trait_count = u30()
e0df6211 802 for _c2 in range(trait_count):
0ca96d48 803 parse_traits_info()
e0df6211 804
ba552f54 805 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
806 assert len(methods) == len(method_idxs)
807
808 method_pyfunctions = {}
809
810 def extract_function(func_name):
811 if func_name in method_pyfunctions:
812 return method_pyfunctions[func_name]
813 if func_name not in methods:
814 raise ExtractorError(u'Cannot find function %r' % func_name)
815 m = methods[func_name]
816
817 def resfunc(args):
e0df6211
PH
818 registers = ['(this)'] + list(args) + [None] * m.local_count
819 stack = []
820 coder = io.BytesIO(m.code)
821 while True:
822 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 823 if opcode == 36: # pushbyte
e0df6211
PH
824 v = struct.unpack('!B', coder.read(1))[0]
825 stack.append(v)
826 elif opcode == 44: # pushstring
827 idx = u30(coder)
828 stack.append(constant_strings[idx])
829 elif opcode == 48: # pushscope
830 # We don't implement the scope register, so we'll just
831 # ignore the popped value
832 stack.pop()
833 elif opcode == 70: # callproperty
834 index = u30(coder)
835 mname = multinames[index]
836 arg_count = u30(coder)
837 args = list(reversed(
838 [stack.pop() for _ in range(arg_count)]))
839 obj = stack.pop()
840 if mname == u'split':
841 assert len(args) == 1
842 assert isinstance(args[0], compat_str)
843 assert isinstance(obj, compat_str)
844 if args[0] == u'':
845 res = list(obj)
846 else:
847 res = obj.split(args[0])
848 stack.append(res)
a7177865
PH
849 elif mname == u'slice':
850 assert len(args) == 1
851 assert isinstance(args[0], int)
852 assert isinstance(obj, list)
853 res = obj[args[0]:]
854 stack.append(res)
855 elif mname == u'join':
856 assert len(args) == 1
857 assert isinstance(args[0], compat_str)
858 assert isinstance(obj, list)
859 res = args[0].join(obj)
860 stack.append(res)
e0df6211
PH
861 elif mname in method_pyfunctions:
862 stack.append(method_pyfunctions[mname](args))
863 else:
864 raise NotImplementedError(
865 u'Unsupported property %r on %r'
866 % (mname, obj))
a7177865
PH
867 elif opcode == 72: # returnvalue
868 res = stack.pop()
869 return res
870 elif opcode == 79: # callpropvoid
871 index = u30(coder)
872 mname = multinames[index]
873 arg_count = u30(coder)
874 args = list(reversed(
875 [stack.pop() for _ in range(arg_count)]))
876 obj = stack.pop()
877 if mname == u'reverse':
878 assert isinstance(obj, list)
879 obj.reverse()
880 else:
881 raise NotImplementedError(
882 u'Unsupported (void) property %r on %r'
883 % (mname, obj))
e0df6211
PH
884 elif opcode == 93: # findpropstrict
885 index = u30(coder)
886 mname = multinames[index]
887 res = extract_function(mname)
888 stack.append(res)
889 elif opcode == 97: # setproperty
890 index = u30(coder)
891 value = stack.pop()
892 idx = stack.pop()
893 obj = stack.pop()
894 assert isinstance(obj, list)
895 assert isinstance(idx, int)
896 obj[idx] = value
897 elif opcode == 98: # getlocal
898 index = u30(coder)
899 stack.append(registers[index])
900 elif opcode == 99: # setlocal
901 index = u30(coder)
902 value = stack.pop()
903 registers[index] = value
904 elif opcode == 102: # getproperty
905 index = u30(coder)
906 pname = multinames[index]
907 if pname == u'length':
908 obj = stack.pop()
909 assert isinstance(obj, list)
910 stack.append(len(obj))
911 else: # Assume attribute access
912 idx = stack.pop()
913 assert isinstance(idx, int)
914 obj = stack.pop()
915 assert isinstance(obj, list)
916 stack.append(obj[idx])
917 elif opcode == 128: # coerce
0ca96d48 918 u30(coder)
e0df6211
PH
919 elif opcode == 133: # coerce_s
920 assert isinstance(stack[-1], (type(None), compat_str))
921 elif opcode == 164: # modulo
922 value2 = stack.pop()
923 value1 = stack.pop()
924 res = value1 % value2
925 stack.append(res)
a7177865
PH
926 elif opcode == 208: # getlocal_0
927 stack.append(registers[0])
928 elif opcode == 209: # getlocal_1
929 stack.append(registers[1])
930 elif opcode == 210: # getlocal_2
931 stack.append(registers[2])
932 elif opcode == 211: # getlocal_3
933 stack.append(registers[3])
e0df6211
PH
934 elif opcode == 214: # setlocal_2
935 registers[2] = stack.pop()
936 elif opcode == 215: # setlocal_3
937 registers[3] = stack.pop()
938 else:
939 raise NotImplementedError(
940 u'Unsupported opcode %d' % opcode)
941
942 method_pyfunctions[func_name] = resfunc
943 return resfunc
944
945 initial_function = extract_function(u'decipher')
946 return lambda s: initial_function([s])
947
83799698 948 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 949 """Turn the encrypted s field into a working signature"""
6b37f0be 950
83799698 951 if player_url is not None:
9f9be844
PH
952 if player_url.startswith(u'//'):
953 player_url = u'https:' + player_url
e0df6211 954 try:
7f8ae73a
PH
955 player_id = (player_url, len(s))
956 if player_id not in self._player_cache:
83799698 957 func = self._extract_signature_function(
c4417ddb 958 video_id, player_url, len(s)
e0df6211 959 )
7f8ae73a
PH
960 self._player_cache[player_id] = func
961 func = self._player_cache[player_id]
edf3e38e
PH
962 if self._downloader.params.get('youtube_print_sig_code'):
963 self._print_sig_code(func, len(s))
964 return func(s)
0ca96d48 965 except Exception:
e0df6211 966 tb = traceback.format_exc()
83799698
PH
967 self._downloader.report_warning(
968 u'Automatic signature extraction failed: ' + tb)
e0df6211 969
d2d8f895
PH
970 self._downloader.report_warning(
971 u'Warning: Falling back to static signature algorithm')
920de7a2 972
2f2ffea9
PH
973 return self._static_decrypt_signature(
974 s, video_id, player_url, age_gate)
e0df6211 975
2f2ffea9 976 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
977 if age_gate:
978 # The videos with age protection use another player, so the
979 # algorithms can be different.
980 if len(s) == 86:
981 return s[2:63] + s[82] + s[64:82] + s[63]
982
bc4b9008 983 if len(s) == 93:
984 return s[86:29:-1] + s[88] + s[28:5:-1]
985 elif len(s) == 92:
444b1165 986 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
987 elif len(s) == 91:
988 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
989 elif len(s) == 90:
990 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 991 elif len(s) == 89:
992 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 993 elif len(s) == 88:
3e223834 994 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 995 elif len(s) == 87:
3a725669 996 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 997 elif len(s) == 86:
f2c327fd 998 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 999 elif len(s) == 85:
6ae8ee3f 1000 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 1001 elif len(s) == 84:
6f56389b 1002 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 1003 elif len(s) == 83:
920de7a2 1004 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 1005 elif len(s) == 82:
c21315f2 1006 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 1007 elif len(s) == 81:
aedd6bb9 1008 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
1009 elif len(s) == 80:
1010 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
1011 elif len(s) == 79:
1012 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
1013
1014 else:
1015 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 1016
1f343eaa 1017 def _get_available_subtitles(self, video_id, webpage):
de7f3446 1018 try:
7fad1c63 1019 sub_list = self._download_webpage(
38c2e5b8 1020 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
1021 video_id, note=False)
1022 except ExtractorError as err:
de7f3446
JMF
1023 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1024 return {}
1025 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1026
1027 sub_lang_list = {}
1028 for l in lang_list:
1029 lang = l[1]
1030 params = compat_urllib_parse.urlencode({
1031 'lang': lang,
1032 'v': video_id,
ca715127 1033 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 1034 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 1035 })
38c2e5b8 1036 url = u'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
1037 sub_lang_list[lang] = url
1038 if not sub_lang_list:
1039 self._downloader.report_warning(u'video doesn\'t have subtitles')
1040 return {}
1041 return sub_lang_list
1042
055e6f36 1043 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
1044 """We need the webpage for getting the captions url, pass it as an
1045 argument to speed up the process."""
ca715127 1046 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
1047 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1048 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 1049 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
1050 if mobj is None:
1051 self._downloader.report_warning(err_msg)
1052 return {}
1053 player_config = json.loads(mobj.group(1))
1054 try:
1055 args = player_config[u'args']
1056 caption_url = args[u'ttsurl']
1057 timestamp = args[u'timestamp']
055e6f36
JMF
1058 # We get the available subtitles
1059 list_params = compat_urllib_parse.urlencode({
1060 'type': 'list',
1061 'tlangs': 1,
1062 'asrs': 1,
de7f3446 1063 })
055e6f36 1064 list_url = caption_url + '&' + list_params
e26f8712 1065 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 1066 original_lang_node = caption_list.find('track')
f6a54188 1067 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
1068 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1069 return {}
1070 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
1071
1072 sub_lang_list = {}
1073 for lang_node in caption_list.findall('target'):
1074 sub_lang = lang_node.attrib['lang_code']
1075 params = compat_urllib_parse.urlencode({
1076 'lang': original_lang,
1077 'tlang': sub_lang,
1078 'fmt': sub_format,
1079 'ts': timestamp,
1080 'kind': 'asr',
1081 })
1082 sub_lang_list[sub_lang] = caption_url + '&' + params
1083 return sub_lang_list
de7f3446
JMF
1084 # An extractor error can be raise by the download process if there are
1085 # no automatic captions but there are subtitles
1086 except (KeyError, ExtractorError):
1087 self._downloader.report_warning(err_msg)
1088 return {}
1089
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of ``cls._VALID_URL`` matched on *url*.

    The pattern is applied in verbose mode. Raises ExtractorError when the
    URL does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1097
1d043b93
JMF
1098 def _extract_from_m3u8(self, manifest_url, video_id):
1099 url_map = {}
1100 def _get_urls(_manifest):
1101 lines = _manifest.split('\n')
1102 urls = filter(lambda l: l and not l.startswith('#'),
1103 lines)
1104 return urls
1105 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1106 formats_urls = _get_urls(manifest)
1107 for format_url in formats_urls:
890f62e8 1108 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1109 url_map[itag] = format_url
1110 return url_map
1111
1fb07d10
JG
1112 def _extract_annotations(self, video_id):
1113 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1114 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1115
c5e8d7af
PH
def _real_extract(self, url):
    """Extract metadata and the available formats for a single video.

    Downloads the watch page and the get_video_info document, collects the
    metadata fields (uploader, title, thumbnail, upload date, description,
    counts, subtitles, duration, annotations), then builds the format list
    from one of: an rtmp 'conn' entry, the url-encoded stream maps
    (decrypting signatures where needed), or an HLS manifest. Formats from
    the DASH manifest are merged in when 'youtube_include_dash_manifest'
    is set.

    Returns the info dict, or None when the 'listsubtitles' option is set
    (the subtitles are only listed in that case).

    Raises ExtractorError when the video info has no token, for "rental"
    videos, when the uploader name is missing, for rtmpe streams, or when
    no source of formats is found.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape characters in the embedded URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({'video_id': video_id,
                                              'el': 'player_embedded',
                                              'gl': 'US',
                                              'hl': 'en',
                                              'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                                              'asv': 3,
                                              'sts':'1588',
                                              })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try each 'el' variant in turn and keep the first response that
        # carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace runs
        # before handing the text to unified_strdate
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link with the value of its title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the page's meta description
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Read the number from the span with the given class; commas are
        # removed before conversion. Returns None when the span is missing.
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        # Only list the subtitles; no info dict is returned in this mode
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        args = info['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            # Prefer the webpage's stream map over the one from video info
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        # No usable player config on the page: keep video_info as it is
        pass

    def _map_to_format_list(urlmap):
        # Build one format dict per itag, merged with the entry for that
        # itag in self._formats when there is one
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Signature supplied in the clear
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: it has to be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        # Verbose mode only: report the signature layout
                        # and which player is in use
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                            (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without the age gate the JS player URL from the
                        # page's "assets" replaces the SWF player URL
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    dash_manifest_url_lst = video_info.get('dashmpd')
    if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
            self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            dash_doc = self._download_xml(
                dash_manifest_url_lst[0], video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already collected under this id, or
                # append a new one seeded from self._formats
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            # The DASH manifest is optional: warn and keep the other formats
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id':           video_id,
        'uploader':     video_uploader,
        'uploader_id':  video_uploader_id,
        'upload_date':  upload_date,
        'title':        video_title,
        'thumbnail':    video_thumbnail,
        'description':  video_description,
        'subtitles':    video_subtitles,
        'duration':     video_duration,
        'age_limit':    18 if age_gate else 0,
        'annotations':  video_annotations,
        'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
        'view_count':   view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats':      formats,
    }
c5e8d7af 1425
880e1c52 1426class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1427 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1428 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1429 (?:https?://)?
1430 (?:\w+\.)?
1431 youtube\.com/
1432 (?:
1433 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1434 \? (?:.*?&)*? (?:p|a|list)=
1435 | p/
1436 )
d67cc9fa
JMF
1437 (
1438 (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1439 # Top tracks, they can also include dots
1440 |(?:MC)[\w\.]*
1441 )
c5e8d7af
PH
1442 .*
1443 |
715c8e7b 1444 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1445 )"""
dcbb4580
JMF
1446 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1447 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
6e47b51e 1448 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1449 IE_NAME = u'youtube:playlist'
1450
880e1c52
JMF
1451 def _real_initialize(self):
1452 self._login()
1453
652cdaa2
JMF
1454 def _ids_to_results(self, ids):
1455 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1456 for vid_id in ids]
1457
1458 def _extract_mix(self, playlist_id):
1459 # The mixes are generated from a a single video
1460 # the id of the playlist is just 'RD' + video_id
7d4afc55 1461 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1462 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
76d1700b
JMF
1463 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1464 get_element_by_attribute('class', 'title ', webpage))
1465 title = clean_html(title_span)
652cdaa2
JMF
1466 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
1467 ids = orderedSet(re.findall(video_re, webpage))
1468 url_results = self._ids_to_results(ids)
1469
1470 return self.playlist_result(url_results, playlist_id, title)
1471
c5e8d7af
PH
1472 def _real_extract(self, url):
1473 # Extract playlist id
d67cc9fa 1474 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1475 if mobj is None:
1476 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1477 playlist_id = mobj.group(1) or mobj.group(2)
1478
1479 # Check if it's a video-specific URL
7c61bd36 1480 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1481 if 'v' in query_dict:
1482 video_id = query_dict['v'][0]
1483 if self._downloader.params.get('noplaylist'):
1484 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1485 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92
FV
1486 else:
1487 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1488
7d4afc55 1489 if playlist_id.startswith('RD'):
652cdaa2
JMF
1490 # Mixes require a custom extraction process
1491 return self._extract_mix(playlist_id)
0a688bc0
JMF
1492 if playlist_id.startswith('TL'):
1493 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1494 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1495
dcbb4580
JMF
1496 # Extract the video ids from the playlist pages
1497 ids = []
c5e8d7af 1498
755eb032 1499 for page_num in itertools.count(1):
dcbb4580 1500 url = self._TEMPLATE_URL % (playlist_id, page_num)
c5e8d7af 1501 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
6e47b51e
JMF
1502 matches = re.finditer(self._VIDEO_RE, page)
1503 # We remove the duplicates and the link with index 0
1504 # (it's not the first video of the playlist)
1505 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1506 ids.extend(new_ids)
c5e8d7af 1507
dcbb4580 1508 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
c5e8d7af
PH
1509 break
1510
c91778f8
PH
1511 try:
1512 playlist_title = self._og_search_title(page)
1513 except RegexNotFoundError:
1514 self.report_warning(
1515 u'Playlist page is missing OpenGraph title, falling back ...',
1516 playlist_id)
1517 playlist_title = self._html_search_regex(
1518 r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
c5e8d7af 1519
652cdaa2 1520 url_results = self._ids_to_results(ids)
dcbb4580 1521 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1522
1523
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" keyword."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list's link on the channel page and return its videos."""
        match = re.match(self._VALID_URL, url)
        channel, title = match.group('chann', 'title')

        # The list's link on the channel page contains the url-encoded title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(list_url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1554
1555
class YoutubeChannelIE(InfoExtractor):
    """Extractor returning the videos of a channel as a playlist."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, each once, in order of first appearance."""
        seen = set()
        ordered_ids = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            found_id = match.group(1)
            if found_id in seen:
                continue
            seen.add(found_id)
            ordered_ids.append(found_id)
        return ordered_ids

    def _real_extract(self, url):
        """Collect the channel's video ids and return them as a playlist result.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(entries, channel_id)
c5e8d7af
PH
1610
1611
class YoutubeUserIE(InfoExtractor):
    """List the uploads of a YouTube user through the GData feed.

    The user name is the first group of ``_VALID_URL``, which accepts a
    youtube.com URL (with or without the ``user/`` path component) or the
    ``ytuser:`` keyword prefix.
    """
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # Number of entries requested per feed query (sent as max-results).
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor in this module wants url.

        The regex of this extractor is too permissive and would also match
        URLs that another YouTube extractor can handle, so every other
        module-level name ending in 'IE' is asked first.
        """
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist whose entries are fetched page by page.

        Raises ExtractorError when the URL does not match ``_VALID_URL`` or
        when a feed page is not valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type result per entry of feed page pagenum."""
            # pagenum * page size + 1: page 0 requests start-index 1.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Inclusive index of the last entry this request can return;
            # without the -1 the note would read e.g. "from 1 to 51".
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' yields nothing for this page.
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The video id is the last path segment of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1672
b05654f0
PH
1673
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the 'ytsearch' keyword, backed by the GData API."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist with at most n results for query.

        Results are requested 50 at a time.  Raises ExtractorError
        (expected) when a response page carries no 'items'.
        """
        collected_ids = []
        wanted = n
        page_index = 0

        while page_index * 50 < wanted:
            # start-index is sent as page_index * 50 + 1.
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query), page_index * 50 + 1)
            raw_response = self._download_webpage(
                api_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_response)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids.extend(
                [item['id'] for item in api_response['items']])

            # Never ask for more than the API says exist.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may overshoot the requested amount; trim to n.
        entries = [
            self.url_result(vid, 'Youtube', video_id=vid)
            for vid in collected_ids[:n]]
        return self.playlist_result(entries, query)
75dff0ee 1712
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE for the 'ytsearchdate' keyword."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same query URL as the parent class plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee
JMF
1718
class YoutubeShowIE(InfoExtractor):
    """Turn a youtube.com/show/ page into one playlist URL per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one for each season playlist."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is linked as its own playlist.
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [
            self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
            for path in season_paths]
04cc9617
JMF
1732
1733
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed.
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """feed_ajax URL for this feed with a '%s' left for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all of its videos as a playlist."""
        entries = []
        paging = 0
        page_number = 1
        while True:
            raw_info = self._download_webpage(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            info = json.loads(raw_info)
            # Ids are de-duplicated within a page, keeping first-seen order.
            watch_links = re.finditer(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            page_ids = orderedSet(link.group(1) for link in watch_links)
            entries.extend(
                self.url_result(vid, 'Youtube', video_id=vid)
                for vid in page_ids)
            # The response names the paging value of the next page; None
            # means this was the last one.
            paging = info['paging']
            if paging is None:
                break
            page_number += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1776
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    # Matches the feed URL or the ":ytsubs" / ":ytsubscriptions" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1782
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    # Matches the feed URL or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1788
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    # Overrides the base-class default of False.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Matches the feed URL or the ":ytwatchlater" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1795
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the other extractors' patterns: in a non-raw literal
    # '\.' is an invalid escape sequence.  The pattern text is unchanged.
    # Matches the feed URL or the ":ythistory" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Overrides the base-class default of False.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1802
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to its underlying playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    # Matches the my_favorites URL or the ":ytfav" family of keywords.
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from the first list= parameter in the page.
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, u'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1813
1814
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch / attribution_link URLs that end right after the first
    query parameter and answer them with a hint about shell quoting."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the problem."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)