]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[wrzuta] Add age limit
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import struct
11import traceback
12import zlib
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 15from .subtitles import SubtitlesInfoExtractor
2b25cb5d 16from ..jsinterp import JSInterpreter
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
04cc9617 34 orderedSet,
edf3e38e 35 write_json_file,
81c2f20b 36 uppercase_escape,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the language-setting URL.

        Returns True when the (non-fatal) download yields a truthy page,
        False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Try to log in with the configured credentials.

        Returns True on success and False whenever no login was performed
        or the attempt failed.  Raises ExtractorError when no credentials
        are available and _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Same falsy result as the other failure paths (was a bare
            # ``return``, i.e. None)
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were
        # rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first
        step that does not succeed.  Does nothing without a downloader."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 130
8377574c 131
de7f3446 132class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 133 IE_DESC = u'YouTube.com'
cb7dfeea 134 _VALID_URL = r"""(?x)^
c5e8d7af 135 (
83aa5293 136 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 137 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 138 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 139 (?:www\.)?pwnyoutube\.com/|
f7000f3a 140 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
141 tube\.majestyc\.net/|
142 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
143 (?:.*?\#/)? # handle anchor (#/) redirect urls
144 (?: # the various things that can precede the ID:
145 (?:(?:v|embed|e)/) # v/ or embed/ or e/
146 |(?: # or the v= param in all its forms
f7000f3a 147 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
148 (?:\?|\#!?) # the params delimiter ? or # or #!
149 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
150 v=
151 )
f4b05232
JMF
152 ))
153 |youtu\.be/ # just youtu.be/xxxx
b9c76aa1 154 |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 155 )
c5e8d7af 156 )? # all until now is optional -> you can pass the naked ID
8963d9c2 157 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
158 (?(1).+)? # if we found the ID, everything can follow
159 $"""
c5e8d7af 160 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
161 _formats = {
162 '5': {'ext': 'flv', 'width': 400, 'height': 240},
163 '6': {'ext': 'flv', 'width': 450, 'height': 270},
164 '13': {'ext': '3gp'},
165 '17': {'ext': '3gp', 'width': 176, 'height': 144},
166 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
167 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
168 '34': {'ext': 'flv', 'width': 640, 'height': 360},
169 '35': {'ext': 'flv', 'width': 854, 'height': 480},
170 '36': {'ext': '3gp', 'width': 320, 'height': 240},
171 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
172 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
173 '43': {'ext': 'webm', 'width': 640, 'height': 360},
174 '44': {'ext': 'webm', 'width': 854, 'height': 480},
175 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
176 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
177
1d043b93 178
86fe61c8 179 # 3d videos
43b81eb9
PH
180 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
181 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
182 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
183 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
184 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
185 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
186 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 187
96fb5605 188 # Apple HTTP Live Streaming
43b81eb9
PH
189 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
190 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
191 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
192 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
193 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
194 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
195 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
196
197 # DASH mp4 video
43b81eb9
PH
198 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
199 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
200 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
201 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
202 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
203 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
204 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
205 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 206
f6f1fc92 207 # Dash mp4 audio
2c62dc26
PH
208 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
209 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
210 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
211
212 # Dash webm
e75cafe9
A
213 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
214 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
215 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
216 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
217 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
218 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
219 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
220 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
221 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
222 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
223 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
224 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
225 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 226 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
227
228 # Dash webm audio
e75cafe9
A
229 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
230 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
231
232 # RTMP (unnamed)
233 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 234 }
836a086c 235
c5e8d7af 236 IE_NAME = u'youtube'
2eb88d95
PH
237 _TESTS = [
238 {
0e853ca4
PH
239 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
240 u"file": u"BaW_jenozKc.mp4",
241 u"info_dict": {
242 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
243 u"uploader": u"Philipp Hagemeister",
244 u"uploader_id": u"phihag",
245 u"upload_date": u"20121002",
ad3bc6ac
PH
246 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
247 u"categories": [u'Science & Technology'],
2eb88d95 248 }
0e853ca4 249 },
0e853ca4
PH
250 {
251 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
252 u"file": u"UxxajLWwzqY.mp4",
253 u"note": u"Test generic use_cipher_signature video (#897)",
254 u"info_dict": {
255 u"upload_date": u"20120506",
256 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
ba60a3eb 257 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
45ed795c 258 u"uploader": u"Icona Pop",
0e853ca4 259 u"uploader_id": u"IconaPop"
2eb88d95 260 }
c108eb73
JMF
261 },
262 {
263 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
264 u"file": u"07FYdnEawAQ.mp4",
265 u"note": u"Test VEVO video with age protection (#956)",
266 u"info_dict": {
267 u"upload_date": u"20130703",
268 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
269 u"description": u"md5:64249768eec3bc4276236606ea996373",
270 u"uploader": u"justintimberlakeVEVO",
271 u"uploader_id": u"justintimberlakeVEVO"
272 }
273 },
fccd3771 274 {
83aa5293 275 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
276 u"file": u"yZIXLfi8CZQ.mp4",
277 u"note": u"Embed-only video (#1746)",
278 u"info_dict": {
279 u"upload_date": u"20120608",
280 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
281 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
282 u"uploader": u"SET India",
283 u"uploader_id": u"setindia"
284 }
285 },
dd27fd17
PH
286 {
287 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
288 u"file": u"a9LDPn-MO4I.m4a",
289 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
290 u"info_dict": {
291 u"upload_date": "20121002",
292 u"uploader_id": "8KVIDEO",
293 u"description": "No description available.",
294 u"uploader": "8KVIDEO",
295 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
296 },
297 u"params": {
298 u"youtube_include_dash_manifest": True,
299 u"format": "141",
300 },
dd27fd17 301 },
3489b7d2
JMF
302 # DASH manifest with encrypted signature
303 {
304 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
305 u'info_dict': {
306 u'id': u'IB3lcPjvWLA',
307 u'ext': u'm4a',
308 u'title': u'Afrojack - The Spark ft. Spree Wilson',
e00c9cf5 309 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
3489b7d2
JMF
310 u'uploader': u'AfrojackVEVO',
311 u'uploader_id': u'AfrojackVEVO',
312 u'upload_date': u'20131011',
313 },
314 u"params": {
315 u'youtube_include_dash_manifest': True,
316 u'format': '141',
317 },
318 },
2eb88d95
PH
319 ]
320
c5e8d7af
PH
321
@classmethod
def suitable(cls, url):
    """Tell whether this IE should handle *url*.

    URLs claimed by YoutubePlaylistIE are rejected; everything else is
    accepted when it matches _VALID_URL.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 327
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base classes and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions, keyed by (player_url, signature length) in
    # _decrypt_signature
    self._player_cache = dict()
e0df6211 331
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
335
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
339
def report_unavailable_format(self, video_id, format):
    """Print a note that *format* is not available for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
343
def report_rtmp_download(self):
    """Print a note that the download will go over RTMP."""
    message = u'RTMP download detected'
    self.to_screen(message)
347
c4417ddb
PH
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a function that deciphers signatures of length *slen*.

    The function is loaded from the filesystem cache when a usable entry
    exists; otherwise the player at *player_url* (type 'js' or 'swf',
    taken from the URL's extension) is downloaded and parsed, and the
    result is written back to the cache on a best-effort basis.

    Raises ExtractorError when the player id/type cannot be derived from
    *player_url* or the player type is not supported.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        # Previously surfaced as an AttributeError on None
        raise ExtractorError(
            u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Guaranteed by the character classes of the regex above
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # The cache stores, for every output position, the index of
            # the input character to copy
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # Unparseable cache file: treat it as a miss; it is rewritten
            # below once the function has been extracted again
            pass

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An assert would vanish under -O and leave ``res`` unbound
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Feed a string of distinct characters through the function so
            # the output reveals which input index ends up where
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is an optimisation only; warn and carry on
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
404
edf3e38e
PH
def _print_sig_code(self, func, slen):
    """Print Python source equivalent to *func* for inputs of length *slen*.

    *func* is applied to a string of *slen* distinct characters; the
    resulting permutation is expressed as a sum of ``s[...]`` index and
    slice expressions and written with to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # With fewer than two indices the pairwise loop below never runs
        # and its loop variable would be unbound afterwards
        if not idxs:
            return
        if len(idxs) == 1:
            yield u's[%d]' % idxs[0]
            return

        step = None
        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; emit it as one slice
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # Start of a run of consecutive indices
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    if not expr_code:
        # Empty result: still emit a valid string expression
        expr_code = u"u''"
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 440
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the signature routine is taken from the first
    ``signature=<name>`` occurrence in *jscode* and extracted with
    JSInterpreter.  The returned callable takes the signature string and
    passes it on as a one-element argument list.
    """
    sig_func_name = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
449
def _parse_sig_swf(self, file_contents):
    """Build a signature function from a zlib-compressed SWF player.

    The first DoABC tag (code 82) is located, its ABC (AVM2 bytecode)
    block is parsed far enough to find the methods of the class
    ``SignatureDecipher``, and a small interpreter for the opcodes used
    by those methods is returned wrapped around its ``decipher`` method.

    Raises ExtractorError for non-SWF input, a missing DoABC tag, a
    missing target class or function, or an unsupported trait kind, and
    NotImplementedError for other compression formats and for
    opcodes/properties the interpreter does not know.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    # The decompressed stream still starts with the rest of the SWF
    # header: a bit-packed FrameSize RECT (5 bits giving the field width,
    # then four fields of that width, padded to a whole byte), followed
    # by FrameRate and FrameCount (2 bytes each).  Tags begin after that.
    framesize_nbits = struct.unpack('!B', content[:1])[0] >> 3
    framesize_len = (5 + 4 * framesize_nbits + 7) // 8
    tags_start = framesize_len + 2 + 2

    def extract_tags(content, start):
        pos = start
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                # Long tag header: the real length follows as 32 bits
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    code_tag = next(
        (tag
         for tag_code, tag in extract_tags(content, tags_start)
         if tag_code == 82),
        None)
    if code_tag is None:
        raise ExtractorError(u'No DoABC tag found in SWF player')
    # Skip the 4 flag bytes and the NUL-terminated name of the tag
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, high bit set
        # on every byte but the last, at most 5 bytes
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        v = read_int(reader)
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool.  Each pool stores count - 1 entries (entry 0 is
    # implicit), hence the ranges starting at 1.
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # A count of 0 means "no entries", just like 1; without the clamp
    # this asked for -8 bytes
    read_bytes(max(0, double_count - 1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields that follow each multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Only QNames are resolved; other kinds get a placeholder
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods (signatures are only skipped over)
    method_count = u30()
    for _c in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for _c2 in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        # Consume one trait entry and return the methods it declares
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            # NOTE(review): keyed index -> name, the reverse of the
            # Method branch above; kept as-is, confirm this is intended
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    # Second pass over the classes: collect method index -> name for the
    # traits of the target class
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole ABC block must have been consumed, and every method of
    # the target class must have a body
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        # Return (building and memoising on first use) a Python callable
        # that interprets the bytecode of method func_name
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    call_args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(call_args) == 1
                        assert isinstance(call_args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if call_args[0] == u'':
                            res = list(obj)
                        else:
                            res = obj.split(call_args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(call_args) == 1
                        assert isinstance(call_args[0], int)
                        assert isinstance(obj, list)
                        res = obj[call_args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(call_args) == 1
                        assert isinstance(call_args[0], compat_str)
                        assert isinstance(obj, list)
                        res = call_args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](call_args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    call_args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
863
83799698 864 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 865 """Turn the encrypted s field into a working signature"""
6b37f0be 866
83799698 867 if player_url is not None:
9f9be844
PH
868 if player_url.startswith(u'//'):
869 player_url = u'https:' + player_url
e0df6211 870 try:
7f8ae73a
PH
871 player_id = (player_url, len(s))
872 if player_id not in self._player_cache:
83799698 873 func = self._extract_signature_function(
c4417ddb 874 video_id, player_url, len(s)
e0df6211 875 )
7f8ae73a
PH
876 self._player_cache[player_id] = func
877 func = self._player_cache[player_id]
edf3e38e
PH
878 if self._downloader.params.get('youtube_print_sig_code'):
879 self._print_sig_code(func, len(s))
880 return func(s)
0ca96d48 881 except Exception:
e0df6211 882 tb = traceback.format_exc()
83799698
PH
883 self._downloader.report_warning(
884 u'Automatic signature extraction failed: ' + tb)
e0df6211 885
d2d8f895
PH
886 self._downloader.report_warning(
887 u'Warning: Falling back to static signature algorithm')
920de7a2 888
2f2ffea9
PH
889 return self._static_decrypt_signature(
890 s, video_id, player_url, age_gate)
e0df6211 891
2f2ffea9 892 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
e0df6211
PH
893 if age_gate:
894 # The videos with age protection use another player, so the
895 # algorithms can be different.
896 if len(s) == 86:
897 return s[2:63] + s[82] + s[64:82] + s[63]
898
bc4b9008 899 if len(s) == 93:
900 return s[86:29:-1] + s[88] + s[28:5:-1]
901 elif len(s) == 92:
444b1165 902 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
38d025b3
JMF
903 elif len(s) == 91:
904 return s[84:27:-1] + s[86] + s[26:5:-1]
444b1165
JMF
905 elif len(s) == 90:
906 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
8a9d86a2 907 elif len(s) == 89:
908 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
444b1165 909 elif len(s) == 88:
3e223834 910 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
be547e1d 911 elif len(s) == 87:
3a725669 912 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
be547e1d 913 elif len(s) == 86:
f2c327fd 914 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
be547e1d 915 elif len(s) == 85:
6ae8ee3f 916 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
be547e1d 917 elif len(s) == 84:
6f56389b 918 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
be547e1d 919 elif len(s) == 83:
920de7a2 920 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
be547e1d 921 elif len(s) == 82:
c21315f2 922 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
be547e1d 923 elif len(s) == 81:
aedd6bb9 924 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
066090dd
JMF
925 elif len(s) == 80:
926 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
5c468ca8
JMF
927 elif len(s) == 79:
928 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
be547e1d
PH
929
930 else:
931 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
c5e8d7af 932
1f343eaa 933 def _get_available_subtitles(self, video_id, webpage):
de7f3446 934 try:
7fad1c63 935 sub_list = self._download_webpage(
38c2e5b8 936 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
937 video_id, note=False)
938 except ExtractorError as err:
de7f3446
JMF
939 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
940 return {}
941 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
942
943 sub_lang_list = {}
944 for l in lang_list:
945 lang = l[1]
946 params = compat_urllib_parse.urlencode({
947 'lang': lang,
948 'v': video_id,
ca715127 949 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 950 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 951 })
38c2e5b8 952 url = u'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
953 sub_lang_list[lang] = url
954 if not sub_lang_list:
955 self._downloader.report_warning(u'video doesn\'t have subtitles')
956 return {}
957 return sub_lang_list
958
055e6f36 959 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
960 """We need the webpage for getting the captions url, pass it as an
961 argument to speed up the process."""
ca715127 962 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
963 self.to_screen(u'%s: Looking for automatic captions' % video_id)
964 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 965 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
966 if mobj is None:
967 self._downloader.report_warning(err_msg)
968 return {}
969 player_config = json.loads(mobj.group(1))
970 try:
971 args = player_config[u'args']
972 caption_url = args[u'ttsurl']
973 timestamp = args[u'timestamp']
055e6f36
JMF
974 # We get the available subtitles
975 list_params = compat_urllib_parse.urlencode({
976 'type': 'list',
977 'tlangs': 1,
978 'asrs': 1,
de7f3446 979 })
055e6f36 980 list_url = caption_url + '&' + list_params
e26f8712 981 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 982 original_lang_node = caption_list.find('track')
f6a54188 983 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
984 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
985 return {}
986 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
987
988 sub_lang_list = {}
989 for lang_node in caption_list.findall('target'):
990 sub_lang = lang_node.attrib['lang_code']
991 params = compat_urllib_parse.urlencode({
992 'lang': original_lang,
993 'tlang': sub_lang,
994 'fmt': sub_format,
995 'ts': timestamp,
996 'kind': 'asr',
997 })
998 sub_lang_list[sub_lang] = caption_url + '&' + params
999 return sub_lang_list
de7f3446
JMF
1000 # An extractor error can be raise by the download process if there are
1001 # no automatic captions but there are subtitles
1002 except (KeyError, ExtractorError):
1003 self._downloader.report_warning(err_msg)
1004 return {}
1005
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id (second group of _VALID_URL) found in url.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
1013
1d043b93
JMF
1014 def _extract_from_m3u8(self, manifest_url, video_id):
1015 url_map = {}
1016 def _get_urls(_manifest):
1017 lines = _manifest.split('\n')
1018 urls = filter(lambda l: l and not l.startswith('#'),
1019 lines)
1020 return urls
1021 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1022 formats_urls = _get_urls(manifest)
1023 for format_url in formats_urls:
890f62e8 1024 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
1025 url_map[itag] = format_url
1026 return url_map
1027
1fb07d10
JG
1028 def _extract_annotations(self, video_id):
1029 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1030 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1031
def _real_extract(self, url):
    """Extract the metadata and the available formats of one video.

    The watch page and the get_video_info response are downloaded, the
    descriptive fields are scraped from them, and the format list is
    built from the rtmp 'conn' entry, the stream maps (deciphering
    encrypted signatures where needed) or the HLS manifest.  When the
    'youtube_include_dash_manifest' option is set, the formats of the
    DASH manifest are merged in as well.

    Returns the info dict of the video, or None when the 'listsubtitles'
    option is set (the available subtitles are listed instead).

    Raises ExtractorError when the video info carries no token, the
    video is a rental, the uploader is missing, the video is only
    served over rtmpe or no format information is found at all.
    """
    proto = (
        u'http' if self._downloader.params.get('prefer_insecure', False)
        else u'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the 'el' variants in turn until one response carries a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                u'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                u'"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn date separators into single spaces before parsing
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories
    m_cat_container = get_element_by_id("eow-category", video_webpage)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Number shown in the span of the given class, or None if absent
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Stays None when the player config cannot be read from the webpage;
    # the DASH section below depends on that to skip gracefully.
    ytplayer_config = None
    try:
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        json_code = uppercase_escape(mobj.group(1))
        ytplayer_config = json.loads(json_code)
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        pass

    def _map_to_format_list(urlmap):
        # Build one format dict per itag, enriched with the known
        # properties of that itag when there are any.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # The signature is deciphered with the JS player
                        # named in the webpage instead of the SWF one.
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if (self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            #
            # Both lookups are written so that a missing manifest surfaces
            # as KeyError, which is reported below as a skipped manifest
            # instead of aborting the extraction.
            if age_gate:
                dash_manifest_url = video_info['dashmpd'][0]
            else:
                player_args = {} if ytplayer_config is None else ytplayer_config['args']
                dash_manifest_url = player_args['dashmpd']

            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already known under this id, or add
                # a new one based on the static properties of the itag.
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1376
880e1c52 1377class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1378 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1379 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1380 (?:https?://)?
1381 (?:\w+\.)?
1382 youtube\.com/
1383 (?:
1384 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1385 \? (?:.*?&)*? (?:p|a|list)=
1386 | p/
1387 )
d67cc9fa
JMF
1388 (
1389 (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1390 # Top tracks, they can also include dots
1391 |(?:MC)[\w\.]*
1392 )
c5e8d7af
PH
1393 .*
1394 |
715c8e7b 1395 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1396 )"""
dbb94fb0 1397 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dcbb4580 1398 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
dbb94fb0 1399 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1400 IE_NAME = u'youtube:playlist'
1401
880e1c52
JMF
1402 def _real_initialize(self):
1403 self._login()
1404
652cdaa2
JMF
1405 def _ids_to_results(self, ids):
1406 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1407 for vid_id in ids]
1408
1409 def _extract_mix(self, playlist_id):
1410 # The mixes are generated from a a single video
1411 # the id of the playlist is just 'RD' + video_id
7d4afc55 1412 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1413 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
bc2f773b
JMF
1414 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1415 title_span = (search_title('playlist-title') or
1416 search_title('title long-title') or search_title('title'))
76d1700b 1417 title = clean_html(title_span)
70e32269 1418 video_re = r'''(?x)data-video-username=".*?".*?
bc2f773b 1419 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
70e32269 1420 ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
652cdaa2
JMF
1421 url_results = self._ids_to_results(ids)
1422
1423 return self.playlist_result(url_results, playlist_id, title)
1424
c5e8d7af
PH
1425 def _real_extract(self, url):
1426 # Extract playlist id
d67cc9fa 1427 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1428 if mobj is None:
1429 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1430 playlist_id = mobj.group(1) or mobj.group(2)
1431
1432 # Check if it's a video-specific URL
7c61bd36 1433 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1434 if 'v' in query_dict:
1435 video_id = query_dict['v'][0]
1436 if self._downloader.params.get('noplaylist'):
1437 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1438 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92 1439 else:
1db26669 1440 self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1441
7d4afc55 1442 if playlist_id.startswith('RD'):
652cdaa2
JMF
1443 # Mixes require a custom extraction process
1444 return self._extract_mix(playlist_id)
0a688bc0
JMF
1445 if playlist_id.startswith('TL'):
1446 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1447 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1448
dbb94fb0
S
1449 url = self._TEMPLATE_URL % playlist_id
1450 page = self._download_webpage(url, playlist_id)
1451 more_widget_html = content_html = page
1452
10c0e2d8 1453 # Check if the playlist exists or is private
e399853d 1454 if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
10c0e2d8
JMF
1455 raise ExtractorError(
1456 u'The playlist doesn\'t exist or is private, use --username or '
1457 '--netrc to access it.',
1458 expected=True)
1459
dcbb4580
JMF
1460 # Extract the video ids from the playlist pages
1461 ids = []
c5e8d7af 1462
755eb032 1463 for page_num in itertools.count(1):
dbb94fb0 1464 matches = re.finditer(self._VIDEO_RE, content_html)
6e47b51e
JMF
1465 # We remove the duplicates and the link with index 0
1466 # (it's not the first video of the playlist)
1467 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1468 ids.extend(new_ids)
c5e8d7af 1469
dbb94fb0
S
1470 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1471 if not mobj:
c5e8d7af
PH
1472 break
1473
dbb94fb0 1474 more = self._download_json(
5912c639
PH
1475 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1476 'Downloading page #%s' % page_num,
1477 transform_source=uppercase_escape)
dbb94fb0
S
1478 content_html = more['content_html']
1479 more_widget_html = more['load_more_widget_html']
1480
1481 playlist_title = self._html_search_regex(
68eb8e90
PH
1482 r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
1483 page, u'title')
c5e8d7af 1484
652cdaa2 1485 url_results = self._ids_to_results(ids)
dcbb4580 1486 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1487
1488
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the top lists of a channel ("yttoplist" keyword)."""

    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on the downloads of the list page; without it a page
    # that never contains videos would be requested forever.
    _MAX_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result of the top list named by url.

        The channel page is searched for the link of the list with the
        requested title, and the linked page is scraped for video ids.
        Raises ExtractorError when the list page still contains no video
        after _MAX_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them (a bounded number of times)
        for attempt in range(self._MAX_ATTEMPTS):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                u'Unable to find any video in the top list "%s" after %d attempts'
                % (title, self._MAX_ATTEMPTS))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1519
1520
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a channel."""

    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked in page, in order, without duplicates."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in found:
                continue
            found.append(candidate)
        return found

    def _real_extract(self, url):
        """Return a playlist result with every video of the channel in url.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # Stop once the widget no longer offers further pages
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1575
1576
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user, read from the GData feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other ``*IE`` class in this module accepts url.

        The regex of this extractor is too permissive and would also match
        URLs that belong to the other extractors, so they get priority.
        """
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result whose entries are the user's uploads.

        Raises ExtractorError if url does not match _VALID_URL or if an API
        page is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type entry per video found on page pagenum."""
            # NOTE(review): the "+ 1" presumes PagedList passes 0-based page
            # numbers while the feed's start-index is 1-based - confirm.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index requested by this query (inclusive).
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' means there are no more videos.
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1637
b05654f0
PH
1638
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (jsonc output)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages of 50 results are requested until n ids were requested or the
        total reported by the API is reached. Returns a playlist result of
        at most n url results titled with the query.

        Raises ExtractorError (expected) when the first page has no items.
        """

        PAGE_SIZE = 50
        video_ids = []
        pagenum = 0
        limit = n

        while (PAGE_SIZE * pagenum) < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                (PAGE_SIZE * pagenum) + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                if video_ids:
                    # A later page came back empty: keep what was already
                    # collected instead of failing the whole search.
                    break
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested count.
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee 1680
c9ae7b95 1681
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE; only the API URL (orderby=published),
    # the search key and the descriptive names differ.
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1687
c9ae7b95
PH
1688
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search result page URLs."""
    IE_DESC = u'YouTube.com search URLs'
    IE_NAME = u'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist dict with one url entry per listed result."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Only the ordered list holding the results is of interest.
        results_html = self._search_regex(
            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')

        def build_entry(heading_html):
            # The title is optional (fatal=False); the link is mandatory.
            item_title = self._html_search_regex(
                r'(?s)title="([^"]+)"', heading_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        headings = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)

        return {
            '_type': 'playlist',
            'entries': [build_entry(heading) for heading in headings],
            'title': query,
        }
1723
1724
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages, which link to one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return a list of url results, one per season playlist link."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(
            url, show_name, u'Downloading show webpage')
        # Every season of the show is published as its own playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            u'%s: Found %s seasons' % (show_name, len(season_paths)))

        results = []
        for path in season_paths:
            results.append(self.url_result(
                'https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1738
1739
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common implementation for extractors reading
    http://www.youtube.com/feed_ajax
    Subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the remaining %s takes the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as a playlist."""
        collected = []
        paging = 0
        page_number = 0
        while True:
            page_number += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            # The HTML fragment is stored under one of two keys.
            feed_html = info.get('feed_html') or info.get('content_html')

            # Drop repeated ids while keeping their order of appearance.
            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The "load more" link carries the paging value of the next page;
            # when it is absent this was the last page.
            more = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if more is None:
                break
            paging = more.group('paging')
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1784
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the "subscriptions" feed.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1790
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the "recommended" feed.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1796
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the "watch_later" feed, which is
    # requested as a personal feed.
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1803
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the "history" feed, which is requested
    # as a personal feed.
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: "\." is not a valid escape sequence in a normal literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1810
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist id."""
        # The page is always fetched from the fixed address below; the
        # given url is not used.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1821
1822
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches URLs that look cut off and reports a quoting hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote."""
        hint = (
            u'Did you forget to quote the URL? '
            u'Remember that & is a meta character in most shells, '
            u'so you want to put the URL in quotes, like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)