]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[jsinterp] Remove superfluous u
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
e0df6211 3import collections
edf3e38e 4import errno
e0df6211 5import io
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211
PH
10import struct
11import traceback
12import zlib
c5e8d7af 13
b05654f0 14from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 15from .subtitles import SubtitlesInfoExtractor
2b25cb5d 16from ..jsinterp import JSInterpreter
c5e8d7af 17from ..utils import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af
PH
23 compat_str,
24
25 clean_html,
c38b1e77 26 get_cachedir,
c5e8d7af 27 get_element_by_id,
652cdaa2 28 get_element_by_attribute,
c5e8d7af 29 ExtractorError,
dd27fd17 30 int_or_none,
b7ab0590 31 PagedList,
c5e8d7af
PH
32 unescapeHTML,
33 unified_strdate,
04cc9617 34 orderedSet,
edf3e38e 35 write_json_file,
81c2f20b 36 uppercase_escape,
c5e8d7af
PH
37)
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and return True if a non-empty page came back.

        The download is non-fatal: on failure the result is falsy and this
        method returns False instead of raising.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True on success and False whenever no login was performed or
        the login failed (no credentials, login page not fetched, login
        request failed, or the login form was shown again).

        Raises ExtractorError if no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) so that every failure path
            # of this method yields the same value.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again is treated as rejected credentials.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True.

        No ``fatal=False`` is passed here, so a failed request is handled
        however _download_webpage handles failures by default.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 130
8377574c 131
de7f3446 132class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose-mode pattern.  Group 1 is the optional URL prefix (scheme, one of
    # the accepted hostnames, and whatever precedes the ID); group 2 is the
    # 11-character video ID.  Trailing text is only accepted when group 1
    # matched, so a bare ID must stand alone.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)? # handle anchor (#/) redirect urls
                         (?: # the various things that can precede the ID:
                             (?:(?:v|embed|e)/) # v/ or embed/ or e/
                             |(?: # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?) # the params delimiter ? or # or #!
                                 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/ # just youtu.be/xxxx
                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
                     $"""
    # Captures the value of a ``next_url`` query parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static per-format properties, keyed by format identifier string
    # (presumably YouTube itag values -- confirm against the code that reads
    # this table).  3D, HLS and DASH entries carry a negative 'preference'.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }

    IE_NAME = u'youtube'
    # Test cases: each entry gives a URL plus the metadata expected for it,
    # and optionally a note and downloader params.  Presumably consumed by the
    # project's download test runner -- the consumer is not visible here.
    _TESTS = [
        {
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"info_dict": {
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                u"categories": [u'Science & Technology'],
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"info_dict": {
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            }
        },
        {
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"info_dict": {
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            }
        },
        {
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"info_dict": {
                u"upload_date": u"20120608",
                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
                u"uploader": u"SET India",
                u"uploader_id": u"setindia"
            }
        },
        {
            u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
            u"file": u"a9LDPn-MO4I.m4a",
            u"note": u"256k DASH audio (format 141) via DASH manifest",
            u"info_dict": {
                u"upload_date": "20121002",
                u"uploader_id": "8KVIDEO",
                u"description": "No description available.",
                u"uploader": "8KVIDEO",
                u"title": "UHDTV TEST 8K VIDEO.mp4"
            },
            u"params": {
                u"youtube_include_dash_manifest": True,
                u"format": "141",
            },
        },
        # DASH manifest with encrypted signature
        {
            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            u'info_dict': {
                u'id': u'IB3lcPjvWLA',
                u'ext': u'm4a',
                u'title': u'Afrojack - The Spark ft. Spree Wilson',
                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                u'uploader': u'AfrojackVEVO',
                u'uploader_id': u'AfrojackVEVO',
                u'upload_date': u'20131011',
            },
            u"params": {
                u'youtube_include_dash_manifest': True,
                u'format': '141',
            },
        },
    ]
321
c5e8d7af
PH
322
323 @classmethod
324 def suitable(cls, url):
325 """Receives a URL and returns True if suitable for this IE."""
e3ea4790 326 if YoutubePlaylistIE.suitable(url): return False
fccd3771 327 return re.match(cls._VALID_URL, url) is not None
c5e8d7af 328
e0df6211
PH
329 def __init__(self, *args, **kwargs):
330 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 331 self._player_cache = {}
e0df6211 332
c5e8d7af
PH
333 def report_video_info_webpage_download(self, video_id):
334 """Report attempt to download video info webpage."""
335 self.to_screen(u'%s: Downloading video info webpage' % video_id)
336
c5e8d7af
PH
337 def report_information_extraction(self, video_id):
338 """Report attempt to extract video information."""
339 self.to_screen(u'%s: Extracting video information' % video_id)
340
341 def report_unavailable_format(self, video_id, format):
342 """Report extracted video URL."""
343 self.to_screen(u'%s: Format %s not available' % (video_id, format))
344
345 def report_rtmp_download(self):
346 """Indicate the download will use the RTMP protocol."""
347 self.to_screen(u'RTMP download detected')
348
c4417ddb
PH
349 def _extract_signature_function(self, video_id, player_url, slen):
350 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
83799698 351 player_url)
e0df6211
PH
352 player_type = id_m.group('ext')
353 player_id = id_m.group('id')
354
c4417ddb
PH
355 # Read from filesystem cache
356 func_id = '%s_%s_%d' % (player_type, player_id, slen)
357 assert os.path.basename(func_id) == func_id
c38b1e77 358 cache_dir = get_cachedir(self._downloader.params)
c4417ddb 359
c3c88a26 360 cache_enabled = cache_dir is not None
f8061589 361 if cache_enabled:
c4417ddb
PH
362 cache_fn = os.path.join(os.path.expanduser(cache_dir),
363 u'youtube-sigfuncs',
364 func_id + '.json')
365 try:
edf3e38e 366 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
c4417ddb
PH
367 cache_spec = json.load(cachef)
368 return lambda s: u''.join(s[i] for i in cache_spec)
edf3e38e 369 except IOError:
c4417ddb 370 pass # No cache available
83799698 371
e0df6211
PH
372 if player_type == 'js':
373 code = self._download_webpage(
374 player_url, video_id,
83799698 375 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 376 errnote=u'Download of %s failed' % player_url)
83799698 377 res = self._parse_sig_js(code)
c4417ddb 378 elif player_type == 'swf':
e0df6211
PH
379 urlh = self._request_webpage(
380 player_url, video_id,
83799698 381 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
382 errnote=u'Download of %s failed' % player_url)
383 code = urlh.read()
83799698 384 res = self._parse_sig_swf(code)
e0df6211
PH
385 else:
386 assert False, 'Invalid player type %r' % player_type
387
f8061589 388 if cache_enabled:
edf3e38e 389 try:
c705320f
PH
390 test_string = u''.join(map(compat_chr, range(slen)))
391 cache_res = res(test_string)
edf3e38e
PH
392 cache_spec = [ord(c) for c in cache_res]
393 try:
394 os.makedirs(os.path.dirname(cache_fn))
395 except OSError as ose:
396 if ose.errno != errno.EEXIST:
397 raise
398 write_json_file(cache_spec, cache_fn)
0ca96d48 399 except Exception:
edf3e38e
PH
400 tb = traceback.format_exc()
401 self._downloader.report_warning(
402 u'Writing cache to %r failed: %s' % (cache_fn, tb))
83799698
PH
403
404 return res
405
edf3e38e
PH
406 def _print_sig_code(self, func, slen):
407 def gen_sig_code(idxs):
408 def _genslice(start, end, step):
409 starts = u'' if start == 0 else str(start)
e35e4ddc
PH
410 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
411 steps = u'' if step == 1 else (u':%d' % step)
edf3e38e
PH
412 return u's[%s%s%s]' % (starts, ends, steps)
413
414 step = None
0ca96d48
PH
415 start = '(Never used)' # Quelch pyflakes warnings - start will be
416 # set as soon as step is set
edf3e38e
PH
417 for i, prev in zip(idxs[1:], idxs[:-1]):
418 if step is not None:
419 if i - prev == step:
420 continue
421 yield _genslice(start, prev, step)
422 step = None
423 continue
424 if i - prev in [-1, 1]:
425 step = i - prev
426 start = prev
427 continue
428 else:
429 yield u's[%d]' % prev
430 if step is None:
431 yield u's[%d]' % i
432 else:
433 yield _genslice(start, i, step)
434
c705320f
PH
435 test_string = u''.join(map(compat_chr, range(slen)))
436 cache_res = func(test_string)
edf3e38e
PH
437 cache_spec = [ord(c) for c in cache_res]
438 expr_code = u' + '.join(gen_sig_code(cache_spec))
439 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
f8061589 440 self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 441
e0df6211
PH
442 def _parse_sig_js(self, jscode):
443 funcname = self._search_regex(
c26e9ac4 444 r'signature=([$a-zA-Z]+)', jscode,
2b25cb5d
PH
445 u'Initial JS player signature function name')
446
447 jsi = JSInterpreter(jscode)
448 initial_function = jsi.extract_function(funcname)
e0df6211
PH
449 return lambda s: initial_function([s])
450
451 def _parse_sig_swf(self, file_contents):
452 if file_contents[1:3] != b'WS':
453 raise ExtractorError(
454 u'Not an SWF file; header is %r' % file_contents[:3])
455 if file_contents[:1] == b'C':
456 content = zlib.decompress(file_contents[8:])
457 else:
458 raise NotImplementedError(u'Unsupported compression format %r' %
459 file_contents[:1])
460
461 def extract_tags(content):
462 pos = 0
463 while pos < len(content):
464 header16 = struct.unpack('<H', content[pos:pos+2])[0]
465 pos += 2
466 tag_code = header16 >> 6
467 tag_len = header16 & 0x3f
468 if tag_len == 0x3f:
469 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
470 pos += 4
471 assert pos+tag_len <= len(content)
472 yield (tag_code, content[pos:pos+tag_len])
473 pos += tag_len
474
475 code_tag = next(tag
476 for tag_code, tag in extract_tags(content)
477 if tag_code == 82)
478 p = code_tag.index(b'\0', 4) + 1
ba552f54 479 code_reader = io.BytesIO(code_tag[p:])
e0df6211
PH
480
481 # Parse ABC (AVM2 ByteCode)
ba552f54
PH
482 def read_int(reader=None):
483 if reader is None:
484 reader = code_reader
e0df6211
PH
485 res = 0
486 shift = 0
487 for _ in range(5):
ba552f54
PH
488 buf = reader.read(1)
489 assert len(buf) == 1
490 b = struct.unpack('<B', buf)[0]
e0df6211
PH
491 res = res | ((b & 0x7f) << shift)
492 if b & 0x80 == 0:
493 break
494 shift += 7
ba552f54
PH
495 return res
496
497 def u30(reader=None):
498 res = read_int(reader)
499 assert res & 0xf0000000 == 0
e0df6211
PH
500 return res
501 u32 = read_int
502
ba552f54
PH
503 def s32(reader=None):
504 v = read_int(reader)
e0df6211
PH
505 if v & 0x80000000 != 0:
506 v = - ((v ^ 0xffffffff) + 1)
ba552f54
PH
507 return v
508
0ca96d48 509 def read_string(reader=None):
ba552f54
PH
510 if reader is None:
511 reader = code_reader
512 slen = u30(reader)
513 resb = reader.read(slen)
514 assert len(resb) == slen
515 return resb.decode('utf-8')
516
517 def read_bytes(count, reader=None):
518 if reader is None:
519 reader = code_reader
520 resb = reader.read(count)
521 assert len(resb) == count
522 return resb
523
524 def read_byte(reader=None):
525 resb = read_bytes(1, reader=reader)
526 res = struct.unpack('<B', resb)[0]
527 return res
e0df6211
PH
528
529 # minor_version + major_version
0ca96d48 530 read_bytes(2 + 2)
e0df6211
PH
531
532 # Constant pool
ba552f54 533 int_count = u30()
e0df6211 534 for _c in range(1, int_count):
0ca96d48 535 s32()
ba552f54 536 uint_count = u30()
e0df6211 537 for _c in range(1, uint_count):
0ca96d48 538 u32()
ba552f54 539 double_count = u30()
0ca96d48 540 read_bytes((double_count-1) * 8)
ba552f54 541 string_count = u30()
e0df6211
PH
542 constant_strings = [u'']
543 for _c in range(1, string_count):
0ca96d48 544 s = read_string()
e0df6211 545 constant_strings.append(s)
ba552f54 546 namespace_count = u30()
e0df6211 547 for _c in range(1, namespace_count):
0ca96d48
PH
548 read_bytes(1) # kind
549 u30() # name
ba552f54 550 ns_set_count = u30()
e0df6211 551 for _c in range(1, ns_set_count):
ba552f54 552 count = u30()
e0df6211 553 for _c2 in range(count):
0ca96d48 554 u30()
ba552f54 555 multiname_count = u30()
e0df6211
PH
556 MULTINAME_SIZES = {
557 0x07: 2, # QName
558 0x0d: 2, # QNameA
559 0x0f: 1, # RTQName
560 0x10: 1, # RTQNameA
561 0x11: 0, # RTQNameL
562 0x12: 0, # RTQNameLA
563 0x09: 2, # Multiname
564 0x0e: 2, # MultinameA
565 0x1b: 1, # MultinameL
566 0x1c: 1, # MultinameLA
567 }
568 multinames = [u'']
569 for _c in range(1, multiname_count):
ba552f54 570 kind = u30()
e0df6211
PH
571 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
572 if kind == 0x07:
0ca96d48 573 u30() # namespace_idx
ba552f54 574 name_idx = u30()
e0df6211
PH
575 multinames.append(constant_strings[name_idx])
576 else:
577 multinames.append('[MULTINAME kind: %d]' % kind)
578 for _c2 in range(MULTINAME_SIZES[kind]):
0ca96d48 579 u30()
e0df6211
PH
580
581 # Methods
ba552f54 582 method_count = u30()
e0df6211
PH
583 MethodInfo = collections.namedtuple(
584 'MethodInfo',
585 ['NEED_ARGUMENTS', 'NEED_REST'])
586 method_infos = []
587 for method_id in range(method_count):
ba552f54 588 param_count = u30()
0ca96d48 589 u30() # return type
e0df6211 590 for _ in range(param_count):
0ca96d48
PH
591 u30() # param type
592 u30() # name index (always 0 for youtube)
ba552f54 593 flags = read_byte()
e0df6211
PH
594 if flags & 0x08 != 0:
595 # Options present
ba552f54 596 option_count = u30()
e0df6211 597 for c in range(option_count):
0ca96d48
PH
598 u30() # val
599 read_bytes(1) # kind
e0df6211
PH
600 if flags & 0x80 != 0:
601 # Param names present
602 for _ in range(param_count):
0ca96d48 603 u30() # param name
e0df6211
PH
604 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
605 method_infos.append(mi)
606
607 # Metadata
ba552f54 608 metadata_count = u30()
e0df6211 609 for _c in range(metadata_count):
0ca96d48 610 u30() # name
ba552f54 611 item_count = u30()
e0df6211 612 for _c2 in range(item_count):
0ca96d48
PH
613 u30() # key
614 u30() # value
ba552f54
PH
615
616 def parse_traits_info():
617 trait_name_idx = u30()
618 kind_full = read_byte()
e0df6211
PH
619 kind = kind_full & 0x0f
620 attrs = kind_full >> 4
621 methods = {}
622 if kind in [0x00, 0x06]: # Slot or Const
0ca96d48
PH
623 u30() # Slot id
624 u30() # type_name_idx
ba552f54 625 vindex = u30()
e0df6211 626 if vindex != 0:
0ca96d48 627 read_byte() # vkind
e0df6211 628 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
0ca96d48 629 u30() # disp_id
ba552f54 630 method_idx = u30()
e0df6211
PH
631 methods[multinames[trait_name_idx]] = method_idx
632 elif kind == 0x04: # Class
0ca96d48
PH
633 u30() # slot_id
634 u30() # classi
e0df6211 635 elif kind == 0x05: # Function
0ca96d48 636 u30() # slot_id
ba552f54 637 function_idx = u30()
e0df6211
PH
638 methods[function_idx] = multinames[trait_name_idx]
639 else:
640 raise ExtractorError(u'Unsupported trait kind %d' % kind)
641
642 if attrs & 0x4 != 0: # Metadata present
ba552f54 643 metadata_count = u30()
e0df6211 644 for _c3 in range(metadata_count):
0ca96d48 645 u30() # metadata index
e0df6211 646
ba552f54 647 return methods
e0df6211
PH
648
649 # Classes
650 TARGET_CLASSNAME = u'SignatureDecipher'
651 searched_idx = multinames.index(TARGET_CLASSNAME)
652 searched_class_id = None
ba552f54 653 class_count = u30()
e0df6211 654 for class_id in range(class_count):
ba552f54 655 name_idx = u30()
e0df6211
PH
656 if name_idx == searched_idx:
657 # We found the class we're looking for!
658 searched_class_id = class_id
0ca96d48 659 u30() # super_name idx
ba552f54 660 flags = read_byte()
e0df6211 661 if flags & 0x08 != 0: # Protected namespace is present
0ca96d48 662 u30() # protected_ns_idx
ba552f54 663 intrf_count = u30()
e0df6211 664 for _c2 in range(intrf_count):
0ca96d48
PH
665 u30()
666 u30() # iinit
ba552f54 667 trait_count = u30()
e0df6211 668 for _c2 in range(trait_count):
0ca96d48 669 parse_traits_info()
e0df6211
PH
670
671 if searched_class_id is None:
672 raise ExtractorError(u'Target class %r not found' %
673 TARGET_CLASSNAME)
674
675 method_names = {}
676 method_idxs = {}
677 for class_id in range(class_count):
0ca96d48 678 u30() # cinit
ba552f54 679 trait_count = u30()
e0df6211 680 for _c2 in range(trait_count):
ba552f54 681 trait_methods = parse_traits_info()
e0df6211
PH
682 if class_id == searched_class_id:
683 method_names.update(trait_methods.items())
684 method_idxs.update(dict(
685 (idx, name)
686 for name, idx in trait_methods.items()))
687
688 # Scripts
ba552f54 689 script_count = u30()
e0df6211 690 for _c in range(script_count):
0ca96d48 691 u30() # init
ba552f54 692 trait_count = u30()
e0df6211 693 for _c2 in range(trait_count):
0ca96d48 694 parse_traits_info()
e0df6211
PH
695
696 # Method bodies
ba552f54 697 method_body_count = u30()
e0df6211
PH
698 Method = collections.namedtuple('Method', ['code', 'local_count'])
699 methods = {}
700 for _c in range(method_body_count):
ba552f54 701 method_idx = u30()
0ca96d48 702 u30() # max_stack
ba552f54 703 local_count = u30()
0ca96d48
PH
704 u30() # init_scope_depth
705 u30() # max_scope_depth
ba552f54
PH
706 code_length = u30()
707 code = read_bytes(code_length)
e0df6211 708 if method_idx in method_idxs:
ba552f54 709 m = Method(code, local_count)
e0df6211 710 methods[method_idxs[method_idx]] = m
ba552f54 711 exception_count = u30()
e0df6211 712 for _c2 in range(exception_count):
0ca96d48
PH
713 u30() # from
714 u30() # to
715 u30() # target
716 u30() # exc_type
717 u30() # var_name
ba552f54 718 trait_count = u30()
e0df6211 719 for _c2 in range(trait_count):
0ca96d48 720 parse_traits_info()
e0df6211 721
ba552f54 722 assert p + code_reader.tell() == len(code_tag)
e0df6211
PH
723 assert len(methods) == len(method_idxs)
724
725 method_pyfunctions = {}
726
727 def extract_function(func_name):
728 if func_name in method_pyfunctions:
729 return method_pyfunctions[func_name]
730 if func_name not in methods:
731 raise ExtractorError(u'Cannot find function %r' % func_name)
732 m = methods[func_name]
733
734 def resfunc(args):
e0df6211
PH
735 registers = ['(this)'] + list(args) + [None] * m.local_count
736 stack = []
737 coder = io.BytesIO(m.code)
738 while True:
739 opcode = struct.unpack('!B', coder.read(1))[0]
a7177865 740 if opcode == 36: # pushbyte
e0df6211
PH
741 v = struct.unpack('!B', coder.read(1))[0]
742 stack.append(v)
743 elif opcode == 44: # pushstring
744 idx = u30(coder)
745 stack.append(constant_strings[idx])
746 elif opcode == 48: # pushscope
747 # We don't implement the scope register, so we'll just
748 # ignore the popped value
749 stack.pop()
750 elif opcode == 70: # callproperty
751 index = u30(coder)
752 mname = multinames[index]
753 arg_count = u30(coder)
754 args = list(reversed(
755 [stack.pop() for _ in range(arg_count)]))
756 obj = stack.pop()
757 if mname == u'split':
758 assert len(args) == 1
759 assert isinstance(args[0], compat_str)
760 assert isinstance(obj, compat_str)
761 if args[0] == u'':
762 res = list(obj)
763 else:
764 res = obj.split(args[0])
765 stack.append(res)
a7177865
PH
766 elif mname == u'slice':
767 assert len(args) == 1
768 assert isinstance(args[0], int)
769 assert isinstance(obj, list)
770 res = obj[args[0]:]
771 stack.append(res)
772 elif mname == u'join':
773 assert len(args) == 1
774 assert isinstance(args[0], compat_str)
775 assert isinstance(obj, list)
776 res = args[0].join(obj)
777 stack.append(res)
e0df6211
PH
778 elif mname in method_pyfunctions:
779 stack.append(method_pyfunctions[mname](args))
780 else:
781 raise NotImplementedError(
782 u'Unsupported property %r on %r'
783 % (mname, obj))
a7177865
PH
784 elif opcode == 72: # returnvalue
785 res = stack.pop()
786 return res
787 elif opcode == 79: # callpropvoid
788 index = u30(coder)
789 mname = multinames[index]
790 arg_count = u30(coder)
791 args = list(reversed(
792 [stack.pop() for _ in range(arg_count)]))
793 obj = stack.pop()
794 if mname == u'reverse':
795 assert isinstance(obj, list)
796 obj.reverse()
797 else:
798 raise NotImplementedError(
799 u'Unsupported (void) property %r on %r'
800 % (mname, obj))
e0df6211
PH
801 elif opcode == 93: # findpropstrict
802 index = u30(coder)
803 mname = multinames[index]
804 res = extract_function(mname)
805 stack.append(res)
806 elif opcode == 97: # setproperty
807 index = u30(coder)
808 value = stack.pop()
809 idx = stack.pop()
810 obj = stack.pop()
811 assert isinstance(obj, list)
812 assert isinstance(idx, int)
813 obj[idx] = value
814 elif opcode == 98: # getlocal
815 index = u30(coder)
816 stack.append(registers[index])
817 elif opcode == 99: # setlocal
818 index = u30(coder)
819 value = stack.pop()
820 registers[index] = value
821 elif opcode == 102: # getproperty
822 index = u30(coder)
823 pname = multinames[index]
824 if pname == u'length':
825 obj = stack.pop()
826 assert isinstance(obj, list)
827 stack.append(len(obj))
828 else: # Assume attribute access
829 idx = stack.pop()
830 assert isinstance(idx, int)
831 obj = stack.pop()
832 assert isinstance(obj, list)
833 stack.append(obj[idx])
834 elif opcode == 128: # coerce
0ca96d48 835 u30(coder)
e0df6211
PH
836 elif opcode == 133: # coerce_s
837 assert isinstance(stack[-1], (type(None), compat_str))
838 elif opcode == 164: # modulo
839 value2 = stack.pop()
840 value1 = stack.pop()
841 res = value1 % value2
842 stack.append(res)
a7177865
PH
843 elif opcode == 208: # getlocal_0
844 stack.append(registers[0])
845 elif opcode == 209: # getlocal_1
846 stack.append(registers[1])
847 elif opcode == 210: # getlocal_2
848 stack.append(registers[2])
849 elif opcode == 211: # getlocal_3
850 stack.append(registers[3])
e0df6211
PH
851 elif opcode == 214: # setlocal_2
852 registers[2] = stack.pop()
853 elif opcode == 215: # setlocal_3
854 registers[3] = stack.pop()
855 else:
856 raise NotImplementedError(
857 u'Unsupported opcode %d' % opcode)
858
859 method_pyfunctions[func_name] = resfunc
860 return resfunc
861
862 initial_function = extract_function(u'decipher')
863 return lambda s: initial_function([s])
864
83799698 865 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 866 """Turn the encrypted s field into a working signature"""
6b37f0be 867
83799698 868 if player_url is not None:
9f9be844
PH
869 if player_url.startswith(u'//'):
870 player_url = u'https:' + player_url
e0df6211 871 try:
7f8ae73a
PH
872 player_id = (player_url, len(s))
873 if player_id not in self._player_cache:
83799698 874 func = self._extract_signature_function(
c4417ddb 875 video_id, player_url, len(s)
e0df6211 876 )
7f8ae73a
PH
877 self._player_cache[player_id] = func
878 func = self._player_cache[player_id]
edf3e38e
PH
879 if self._downloader.params.get('youtube_print_sig_code'):
880 self._print_sig_code(func, len(s))
881 return func(s)
b3a88780 882 except Exception as e:
e0df6211 883 tb = traceback.format_exc()
b3a88780
PH
884 raise ExtractorError(
885 u'Automatic signature extraction failed: ' + tb, cause=e)
920de7a2 886
2f2ffea9
PH
887 return self._static_decrypt_signature(
888 s, video_id, player_url, age_gate)
e0df6211 889
def _get_available_subtitles(self, video_id, webpage):
    """Map each subtitle language code to its timedtext download URL.

    The track list is downloaded from video.google.com; ``webpage`` is not
    used here.  An empty dict is returned, after a warning, when the list
    cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    # Each match is a (track name, language code) pair.
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_list)

    urls_by_lang = {}
    for track_name, lang_code in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang_code,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        urls_by_lang[lang_code] = u'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
915
055e6f36 916 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
917 """We need the webpage for getting the captions url, pass it as an
918 argument to speed up the process."""
ca715127 919 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
920 self.to_screen(u'%s: Looking for automatic captions' % video_id)
921 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 922 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
923 if mobj is None:
924 self._downloader.report_warning(err_msg)
925 return {}
926 player_config = json.loads(mobj.group(1))
927 try:
928 args = player_config[u'args']
929 caption_url = args[u'ttsurl']
930 timestamp = args[u'timestamp']
055e6f36
JMF
931 # We get the available subtitles
932 list_params = compat_urllib_parse.urlencode({
933 'type': 'list',
934 'tlangs': 1,
935 'asrs': 1,
de7f3446 936 })
055e6f36 937 list_url = caption_url + '&' + list_params
e26f8712 938 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 939 original_lang_node = caption_list.find('track')
f6a54188 940 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
941 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
942 return {}
943 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
944
945 sub_lang_list = {}
946 for lang_node in caption_list.findall('target'):
947 sub_lang = lang_node.attrib['lang_code']
948 params = compat_urllib_parse.urlencode({
949 'lang': original_lang,
950 'tlang': sub_lang,
951 'fmt': sub_format,
952 'ts': timestamp,
953 'kind': 'asr',
954 })
955 sub_lang_list[sub_lang] = caption_url + '&' + params
956 return sub_lang_list
de7f3446
JMF
957 # An extractor error can be raise by the download process if there are
958 # no automatic captions but there are subtitles
959 except (KeyError, ExtractorError):
960 self._downloader.report_warning(err_msg)
961 return {}
962
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of ``cls._VALID_URL``.

    The pattern is matched in verbose mode.  Raises ExtractorError when
    ``url`` does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
970
1d043b93
JMF
971 def _extract_from_m3u8(self, manifest_url, video_id):
972 url_map = {}
973 def _get_urls(_manifest):
974 lines = _manifest.split('\n')
975 urls = filter(lambda l: l and not l.startswith('#'),
976 lines)
977 return urls
978 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
979 formats_urls = _get_urls(manifest)
980 for format_url in formats_urls:
890f62e8 981 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
982 url_map[itag] = format_url
983 return url_map
984
1fb07d10
JG
985 def _extract_annotations(self, video_id):
986 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
987 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
988
def _real_extract(self, url):
    """Extract the metadata and the available formats of a single video.

    Downloads the watch page and the ``get_video_info`` data, collects the
    descriptive fields (uploader, title, thumbnail, upload date, category,
    description, counters, subtitles, duration, annotations) and builds
    the format list from an RTMP connection, the URL-encoded stream maps
    or an HLS manifest, optionally merged with the DASH manifest.

    Returns the info dict, or None when the 'listsubtitles' parameter is
    set (the subtitles are only listed in that case).

    Raises ExtractorError when the video info carries no token, for
    rental videos, when the uploader is missing, for rtmpe streams and
    when no format source is found.
    """
    proto = (
        u'http' if self._downloader.params.get('prefer_insecure', False)
        else u'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape characters in the embedded URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the 'el' variants in turn until one answer carries a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                u'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                u'"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace runs
        # before handing the text to unified_strdate.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = get_element_by_id("eow-category", video_webpage)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace every redirect link by the value of its title attribute.
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Read the number shown in the <span> of class `klass`; thousands
        # separators are removed before the conversion.
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # Bound up front: the block below may bail out before the player
    # configuration is decoded, and the DASH code further down reads it.
    ytplayer_config = None
    try:
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        json_code = uppercase_escape(mobj.group(1))
        ytplayer_config = json.loads(json_code)
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except ValueError:
        # Best effort: without usable data in the watch page we keep what
        # get_video_info returned.
        pass

    def _map_to_format_list(urlmap):
        # Build one format dict per itag, enriched with the static
        # properties in self._formats when the itag is listed there.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Outside the age gate the JS player URL from the
                        # watch page replaces the SWF one found earlier.
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', False):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                # Subscript instead of .get(): a missing key must surface
                # as the KeyError handled below, not as a TypeError.
                dash_manifest_url = video_info['dashmpd'][0]
            else:
                if ytplayer_config is None:
                    raise ExtractorError(
                        u'player configuration not found in the watch page',
                        expected=True)
                dash_manifest_url = ytplayer_config['args']['dashmpd']

            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already known under this id, if any;
                # otherwise add a new one.
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1333
880e1c52 1334class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
0f818663 1335 IE_DESC = u'YouTube.com playlists'
d67cc9fa 1336 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1337 (?:https?://)?
1338 (?:\w+\.)?
1339 youtube\.com/
1340 (?:
1341 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1342 \? (?:.*?&)*? (?:p|a|list)=
1343 | p/
1344 )
d67cc9fa 1345 (
7d568f5a 1346 (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
d67cc9fa
JMF
1347 # Top tracks, they can also include dots
1348 |(?:MC)[\w\.]*
1349 )
c5e8d7af
PH
1350 .*
1351 |
7d568f5a 1352 ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1353 )"""
dbb94fb0 1354 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dcbb4580 1355 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
dbb94fb0 1356 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
c5e8d7af
PH
1357 IE_NAME = u'youtube:playlist'
1358
880e1c52
JMF
1359 def _real_initialize(self):
1360 self._login()
1361
652cdaa2
JMF
1362 def _ids_to_results(self, ids):
1363 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1364 for vid_id in ids]
1365
1366 def _extract_mix(self, playlist_id):
1367 # The mixes are generated from a a single video
1368 # the id of the playlist is just 'RD' + video_id
7d4afc55 1369 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
652cdaa2 1370 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
bc2f773b
JMF
1371 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1372 title_span = (search_title('playlist-title') or
1373 search_title('title long-title') or search_title('title'))
76d1700b 1374 title = clean_html(title_span)
70e32269 1375 video_re = r'''(?x)data-video-username=".*?".*?
bc2f773b 1376 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
70e32269 1377 ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
652cdaa2
JMF
1378 url_results = self._ids_to_results(ids)
1379
1380 return self.playlist_result(url_results, playlist_id, title)
1381
c5e8d7af
PH
1382 def _real_extract(self, url):
1383 # Extract playlist id
d67cc9fa 1384 mobj = re.match(self._VALID_URL, url)
c5e8d7af
PH
1385 if mobj is None:
1386 raise ExtractorError(u'Invalid URL: %s' % url)
47192f92
FV
1387 playlist_id = mobj.group(1) or mobj.group(2)
1388
1389 # Check if it's a video-specific URL
7c61bd36 1390 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1391 if 'v' in query_dict:
1392 video_id = query_dict['v'][0]
1393 if self._downloader.params.get('noplaylist'):
1394 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1395 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92 1396 else:
1db26669 1397 self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1398
7d4afc55 1399 if playlist_id.startswith('RD'):
652cdaa2
JMF
1400 # Mixes require a custom extraction process
1401 return self._extract_mix(playlist_id)
0a688bc0
JMF
1402 if playlist_id.startswith('TL'):
1403 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1404 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1405
dbb94fb0
S
1406 url = self._TEMPLATE_URL % playlist_id
1407 page = self._download_webpage(url, playlist_id)
1408 more_widget_html = content_html = page
1409
10c0e2d8 1410 # Check if the playlist exists or is private
e399853d 1411 if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
10c0e2d8
JMF
1412 raise ExtractorError(
1413 u'The playlist doesn\'t exist or is private, use --username or '
1414 '--netrc to access it.',
1415 expected=True)
1416
dcbb4580
JMF
1417 # Extract the video ids from the playlist pages
1418 ids = []
c5e8d7af 1419
755eb032 1420 for page_num in itertools.count(1):
dbb94fb0 1421 matches = re.finditer(self._VIDEO_RE, content_html)
6e47b51e
JMF
1422 # We remove the duplicates and the link with index 0
1423 # (it's not the first video of the playlist)
1424 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1425 ids.extend(new_ids)
c5e8d7af 1426
dbb94fb0
S
1427 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1428 if not mobj:
c5e8d7af
PH
1429 break
1430
dbb94fb0 1431 more = self._download_json(
5912c639
PH
1432 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1433 'Downloading page #%s' % page_num,
1434 transform_source=uppercase_escape)
dbb94fb0
S
1435 content_html = more['content_html']
1436 more_widget_html = more['load_more_widget_html']
1437
1438 playlist_title = self._html_search_regex(
68eb8e90
PH
1439 r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
1440 page, u'title')
c5e8d7af 1441
652cdaa2 1442 url_results = self._ids_to_results(ids)
dcbb4580 1443 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1444
1445
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the ``yttoplist:{channel}:{list title}`` pseudo-URL."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        """Find the list called ``title`` on the channel page and return its videos.

        The list page is downloaded again and again until it contains at
        least one video id.
        """
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')

        # The link to the list is recognised by its urlencoded title.
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        link_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        list_link = self._html_search_regex(link_re, channel_page, u'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', list_link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            list_page = self._download_webpage(list_url, title, note)
            ids = orderedSet(re.findall(video_re, list_page))
            if ids:
                break
            attempt += 1

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1476
1477
c5e8d7af 1478class YoutubeChannelIE(InfoExtractor):
0f818663 1479 IE_DESC = u'YouTube.com channels'
c5e8d7af 1480 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
c5e8d7af 1481 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
38c2e5b8 1482 _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
c5e8d7af
PH
1483 IE_NAME = u'youtube:channel'
1484
1485 def extract_videos_from_page(self, page):
1486 ids_in_page = []
1487 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1488 if mobj.group(1) not in ids_in_page:
1489 ids_in_page.append(mobj.group(1))
1490 return ids_in_page
1491
1492 def _real_extract(self, url):
1493 # Extract channel id
1494 mobj = re.match(self._VALID_URL, url)
1495 if mobj is None:
1496 raise ExtractorError(u'Invalid URL: %s' % url)
1497
1498 # Download channel page
1499 channel_id = mobj.group(1)
1500 video_ids = []
b9643eed
JMF
1501 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1502 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1503 autogenerated = re.search(r'''(?x)
1504 class="[^"]*?(?:
1505 channel-header-autogenerated-label|
1506 yt-channel-title-autogenerated
1507 )[^"]*"''', channel_page) is not None
c5e8d7af 1508
b9643eed
JMF
1509 if autogenerated:
1510 # The videos are contained in a single page
1511 # the ajax pages can't be used, they are empty
1512 video_ids = self.extract_videos_from_page(channel_page)
1513 else:
1514 # Download all channel pages using the json-based channel_ajax query
1515 for pagenum in itertools.count(1):
1516 url = self._MORE_PAGES_URL % (pagenum, channel_id)
81c2f20b
PH
1517 page = self._download_json(
1518 url, channel_id, note=u'Downloading page #%s' % pagenum,
1519 transform_source=uppercase_escape)
1520
b9643eed
JMF
1521 ids_in_page = self.extract_videos_from_page(page['content_html'])
1522 video_ids.extend(ids_in_page)
1523
1524 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1525 break
c5e8d7af
PH
1526
1527 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1528
7012b23c
PH
1529 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1530 for video_id in video_ids]
1531 return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1532
1533
class YoutubeUserIE(InfoExtractor):
    """Extractor for user URLs and the ``ytuser:`` keyword.

    The uploads are listed lazily, page by page, through the GData feed
    in _GDATA_URL.
    """
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result whose entries are fetched on demand.

        Raises ExtractorError when ``url`` does not match _VALID_URL; the
        page downloader raises it as well for an undecodable API answer.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # Generator yielding the url entries of the 0-based page
            # `pagenum`; it yields nothing when the feed has no 'entry'.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # The request asks for _GDATA_PAGE_SIZE items starting at
            # start_index, so the last requested index is one less than
            # their sum.
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1594
b05654f0
PH
1595
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor that queries the gdata.youtube.com video feed."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    # Placeholders: quoted query, 1-based index of the first result.
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist result with at most n videos found for query.

        Pages of 50 results are requested until either n results or the
        'totalItems' count announced by the API has been covered.

        Raises ExtractorError (marked as expected) when a response has no
        'items' key.
        """
        page_size = 50
        collected_ids = []
        wanted = n
        page_index = 0

        while page_size * page_index < wanted:
            first_result = page_size * page_index + 1
            quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            raw_page = self._download_webpage(
                self._API_URL % (quoted_query, first_result),
                video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_page)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids.extend(
                [video['id'] for video in api_response['items']])

            # Never ask for more than the API says is available.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have pushed the total past n.
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee 1637
c9ae7b95 1638
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YoutubeSearchIE variant whose API URL adds orderby=published."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same placeholders as the parent's _API_URL.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1644
c9ae7b95
PH
1645
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results?search_query=... pages."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Scrape the result page at url into a playlist dict.

        The playlist title is the unquoted search query; each entry is a
        url-type dict with the absolute item URL and its title.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)

        # Items are only looked for inside the "item-section" list.
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        def build_entry(item_html):
            """Turn the HTML of one result heading into a url-type entry."""
            # The title is optional (fatal=False); the link is not.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        item_htmls = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)
        return {
            '_type': 'playlist',
            'entries': [build_entry(item_html) for item_html in item_htmls],
            'title': query,
        }
1680
1681
class YoutubeShowIE(InfoExtractor):
    """Extractor for youtube.com/show/... pages."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list with one YoutubePlaylist url result per season link."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [
            self.url_result('https://www.youtube.com' + season_path, 'YoutubePlaylist')
            for season_path in season_paths]
04cc9617
JMF
1695
1696
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common code of the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When true, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL of the feed with one %s placeholder left for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the videos of every feed page into one playlist result.

        Paging follows the value found in the "load more" link of each
        page and stops at the first page that has no such link.
        """
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            # The HTML fragment is read from either of these two keys.
            feed_html = info.get('feed_html') or info.get('content_html')

            # orderedSet is applied so each id is listed once per page.
            page_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            load_more = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if load_more is None:
                break
            paging = load_more.group('paging')
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1741
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    # Either the feed URL or the ":ytsubs" / ":ytsubscriptions" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1747
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Either the feed URL or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1753
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' personal feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Either the feed URL or the ":ytwatchlater" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1760
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' personal feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the patterns of the sibling feed extractors: '\.' is
    # not a valid escape sequence in a non-raw literal. The pattern text
    # itself is unchanged. Matches the feed URL or the ":ythistory" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1767
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    # Either the my_favorites URL or a ":ytfav..." keyword.
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a YoutubePlaylist url result for the favourites list.

        The given url is not used; the my_favorites page is always
        downloaded and the first "list=" value found in it is taken as
        the playlist id.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1778
1779
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs that lost their video id and explain the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list

    # Matches a watch URL whose query is empty or holds only a feature or
    # annotation_id parameter, or an attribution_link URL with nothing
    # after its "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)