]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[nfl] Fix test case - download, but don't check md5
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
e0df6211 10import traceback
c5e8d7af 11
b05654f0 12from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 13from .subtitles import SubtitlesInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
c5e8d7af 16from ..utils import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af
PH
22 compat_str,
23
24 clean_html,
25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
b7ab0590 29 PagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
81c2f20b 33 uppercase_escape,
c5e8d7af
PH
34)
35
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the language-setting URL; return True if a page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) so the result matches the docstring.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail
                # out instead of dereferencing the failed match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning(u'unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age-confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 199
8377574c 200
de7f3446 201class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 202 IE_DESC = 'YouTube.com'
cb7dfeea 203 _VALID_URL = r"""(?x)^
c5e8d7af 204 (
edb53e2d 205 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 206 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 207 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 208 (?:www\.)?pwnyoutube\.com/|
f7000f3a 209 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
210 tube\.majestyc\.net/|
211 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
212 (?:.*?\#/)? # handle anchor (#/) redirect urls
213 (?: # the various things that can precede the ID:
214 (?:(?:v|embed|e)/) # v/ or embed/ or e/
215 |(?: # or the v= param in all its forms
f7000f3a 216 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
217 (?:\?|\#!?) # the params delimiter ? or # or #!
218 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
219 v=
220 )
f4b05232
JMF
221 ))
222 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 223 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 224 )
c5e8d7af 225 )? # all until now is optional -> you can pass the naked ID
8963d9c2 226 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 227 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
228 (?(1).+)? # if we found the ID, everything can follow
229 $"""
c5e8d7af 230 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
231 _formats = {
232 '5': {'ext': 'flv', 'width': 400, 'height': 240},
233 '6': {'ext': 'flv', 'width': 450, 'height': 270},
234 '13': {'ext': '3gp'},
235 '17': {'ext': '3gp', 'width': 176, 'height': 144},
236 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
237 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
238 '34': {'ext': 'flv', 'width': 640, 'height': 360},
239 '35': {'ext': 'flv', 'width': 854, 'height': 480},
240 '36': {'ext': '3gp', 'width': 320, 'height': 240},
241 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
242 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
243 '43': {'ext': 'webm', 'width': 640, 'height': 360},
244 '44': {'ext': 'webm', 'width': 854, 'height': 480},
245 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
246 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
247
1d043b93 248
86fe61c8 249 # 3d videos
43b81eb9
PH
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 257
96fb5605 258 # Apple HTTP Live Streaming
43b81eb9
PH
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
266
267 # DASH mp4 video
43b81eb9
PH
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 276
f6f1fc92 277 # Dash mp4 audio
2c62dc26
PH
278 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
279 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
280 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
281
282 # Dash webm
e75cafe9
A
283 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
284 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
290 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
291 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 296 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 297 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
298
299 # Dash webm audio
55db73ef 300 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 301 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
302
303 # RTMP (unnamed)
304 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 305 }
836a086c 306
78caa52a 307 IE_NAME = 'youtube'
2eb88d95
PH
308 _TESTS = [
309 {
0e853ca4
PH
310 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
311 u"file": u"BaW_jenozKc.mp4",
312 u"info_dict": {
313 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
314 u"uploader": u"Philipp Hagemeister",
315 u"uploader_id": u"phihag",
316 u"upload_date": u"20121002",
ad3bc6ac
PH
317 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
318 u"categories": [u'Science & Technology'],
3e7c1224
PH
319 'like_count': int,
320 'dislike_count': int,
2eb88d95 321 }
0e853ca4 322 },
0e853ca4
PH
323 {
324 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
325 u"file": u"UxxajLWwzqY.mp4",
326 u"note": u"Test generic use_cipher_signature video (#897)",
327 u"info_dict": {
328 u"upload_date": u"20120506",
329 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
ba60a3eb 330 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
45ed795c 331 u"uploader": u"Icona Pop",
0e853ca4 332 u"uploader_id": u"IconaPop"
2eb88d95 333 }
c108eb73
JMF
334 },
335 {
336 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
337 u"file": u"07FYdnEawAQ.mp4",
338 u"note": u"Test VEVO video with age protection (#956)",
339 u"info_dict": {
340 u"upload_date": u"20130703",
341 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
342 u"description": u"md5:64249768eec3bc4276236606ea996373",
343 u"uploader": u"justintimberlakeVEVO",
344 u"uploader_id": u"justintimberlakeVEVO"
345 }
346 },
fccd3771 347 {
83aa5293 348 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
349 u"file": u"yZIXLfi8CZQ.mp4",
350 u"note": u"Embed-only video (#1746)",
351 u"info_dict": {
352 u"upload_date": u"20120608",
353 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
354 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
355 u"uploader": u"SET India",
356 u"uploader_id": u"setindia"
357 }
358 },
dd27fd17
PH
359 {
360 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
361 u"file": u"a9LDPn-MO4I.m4a",
362 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
363 u"info_dict": {
364 u"upload_date": "20121002",
365 u"uploader_id": "8KVIDEO",
310d75d1 366 u"description": '',
dd27fd17
PH
367 u"uploader": "8KVIDEO",
368 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
369 },
370 u"params": {
371 u"youtube_include_dash_manifest": True,
372 u"format": "141",
373 },
dd27fd17 374 },
3489b7d2
JMF
375 # DASH manifest with encrypted signature
376 {
78caa52a
PH
377 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
378 'info_dict': {
379 'id': 'IB3lcPjvWLA',
380 'ext': 'm4a',
381 'title': 'Afrojack - The Spark ft. Spree Wilson',
382 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
383 'uploader': 'AfrojackVEVO',
384 'uploader_id': 'AfrojackVEVO',
385 'upload_date': '20131011',
3489b7d2
JMF
386 },
387 u"params": {
78caa52a
PH
388 'youtube_include_dash_manifest': True,
389 'format': '141',
3489b7d2
JMF
390 },
391 },
2eb88d95
PH
392 ]
393
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature, keyed by (player_url, signature shape).
    self._player_cache = dict()
e0df6211 397
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
401
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
405
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available for the video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
409
def report_rtmp_download(self):
    """Tell the user that the download will go over the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
413
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the signature's dot-separated parts, joined by dots."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
417
418 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 419 id_m = re.match(
c081b35c 420 r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 421 player_url)
c081b35c
PH
422 if not id_m:
423 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
424 player_type = id_m.group('ext')
425 player_id = id_m.group('id')
426
c4417ddb 427 # Read from filesystem cache
60064c53
PH
428 func_id = '%s_%s_%s' % (
429 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 430 assert os.path.basename(func_id) == func_id
a0e07d31
PH
431
432 cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
433 if cache_spec is not None:
78caa52a 434 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 435
e0df6211
PH
436 if player_type == 'js':
437 code = self._download_webpage(
438 player_url, video_id,
83799698 439 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211 440 errnote=u'Download of %s failed' % player_url)
83799698 441 res = self._parse_sig_js(code)
c4417ddb 442 elif player_type == 'swf':
e0df6211
PH
443 urlh = self._request_webpage(
444 player_url, video_id,
83799698 445 note=u'Downloading %s player %s' % (player_type, player_id),
e0df6211
PH
446 errnote=u'Download of %s failed' % player_url)
447 code = urlh.read()
83799698 448 res = self._parse_sig_swf(code)
e0df6211
PH
449 else:
450 assert False, 'Invalid player type %r' % player_type
451
a0e07d31 452 if cache_spec is None:
78caa52a 453 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
454 cache_res = res(test_string)
455 cache_spec = [ord(c) for c in cache_res]
83799698 456
a0e07d31 457 self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
83799698
PH
458 return res
459
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function *func*.

    func is probed with a string whose i-th character has code point i,
    sized like example_sig; the resulting index list is rendered as a sum
    of slices and single-index expressions and written with to_screen.
    """
    def gen_sig_code(idxs):
        """Yield slice / index expressions on `s` that reproduce idxs."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            ends = (u':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (u':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to emit for an empty index list.
            return
        # Pre-bind i so that a single-element list (for which the loop
        # below never runs) is still emitted by the trailing yield.
        i = idxs[0]

        step = None
        start = '(Never used)'  # Quell pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 498
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature-deciphering callable from JavaScript player code.

    The function name is taken from the first `signature=<name>` match in
    jscode and extracted with JSInterpreter.
    """
    name_pattern = r'signature=([$a-zA-Z]+)'
    sig_func_name = self._search_regex(
        name_pattern, jscode, 'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_func = interpreter.extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return js_func([s])

    return decipher
507
def _parse_sig_swf(self, file_contents):
    """Build a signature-deciphering callable from SWF player contents.

    Uses SWFInterpreter to extract the `decipher` function of the
    `SignatureDecipher` class.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_func = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return swf_func([s])

    return decipher
514
83799698 515 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 516 """Turn the encrypted s field into a working signature"""
6b37f0be 517
c8bf86d5
PH
518 if player_url is None:
519 raise ExtractorError(u'Cannot decrypt signature without player_url')
920de7a2 520
c8bf86d5 521 if player_url.startswith(u'//'):
78caa52a 522 player_url = 'https:' + player_url
c8bf86d5 523 try:
62af3a0e 524 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
525 if player_id not in self._player_cache:
526 func = self._extract_signature_function(
60064c53 527 video_id, player_url, s
c8bf86d5
PH
528 )
529 self._player_cache[player_id] = func
530 func = self._player_cache[player_id]
531 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 532 self._print_sig_code(func, s)
c8bf86d5
PH
533 return func(s)
534 except Exception as e:
535 tb = traceback.format_exc()
536 raise ExtractorError(
78caa52a 537 'Signature extraction failed: ' + tb, cause=e)
e0df6211 538
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL for video_id.

    The track list is downloaded from the timedtext service; only the
    first track seen for each language code is kept.  An empty dict is
    returned (after a warning) if the list cannot be downloaded or
    contains no tracks.  The webpage argument is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing)
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')

    urls_by_lang = {}
    for track_name, lang_code in tracks:
        if lang_code in urls_by_lang:
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang_code,
            'v': video_id,
            'fmt': sub_format,
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        urls_by_lang[lang_code] = 'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
566
055e6f36 567 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
568 """We need the webpage for getting the captions url, pass it as an
569 argument to speed up the process."""
ca715127 570 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
571 self.to_screen(u'%s: Looking for automatic captions' % video_id)
572 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 573 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
574 if mobj is None:
575 self._downloader.report_warning(err_msg)
576 return {}
577 player_config = json.loads(mobj.group(1))
578 try:
579 args = player_config[u'args']
580 caption_url = args[u'ttsurl']
581 timestamp = args[u'timestamp']
055e6f36
JMF
582 # We get the available subtitles
583 list_params = compat_urllib_parse.urlencode({
584 'type': 'list',
585 'tlangs': 1,
586 'asrs': 1,
de7f3446 587 })
055e6f36 588 list_url = caption_url + '&' + list_params
e26f8712 589 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 590 original_lang_node = caption_list.find('track')
f6a54188 591 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
592 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
593 return {}
594 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
595
596 sub_lang_list = {}
597 for lang_node in caption_list.findall('target'):
598 sub_lang = lang_node.attrib['lang_code']
599 params = compat_urllib_parse.urlencode({
600 'lang': original_lang,
601 'tlang': sub_lang,
602 'fmt': sub_format,
603 'ts': timestamp,
604 'kind': 'asr',
605 })
606 sub_lang_list[sub_lang] = caption_url + '&' + params
607 return sub_lang_list
de7f3446
JMF
608 # An extractor error can be raise by the download process if there are
609 # no automatic captions but there are subtitles
610 except (KeyError, ExtractorError):
611 self._downloader.report_warning(err_msg)
612 return {}
613
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in url.

    Raises ExtractorError if url does not match _VALID_URL.
    """
    url_match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if url_match is not None:
        return url_match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
621
1d043b93
JMF
622 def _extract_from_m3u8(self, manifest_url, video_id):
623 url_map = {}
624 def _get_urls(_manifest):
625 lines = _manifest.split('\n')
626 urls = filter(lambda l: l and not l.startswith('#'),
627 lines)
628 return urls
78caa52a 629 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
630 formats_urls = _get_urls(manifest)
631 for format_url in formats_urls:
890f62e8 632 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
633 url_map[itag] = format_url
634 return url_map
635
1fb07d10
JG
636 def _extract_annotations(self, video_id):
637 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
638 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
639
c5e8d7af 640 def _real_extract(self, url):
7e8c0af0 641 proto = (
78caa52a
PH
642 'http' if self._downloader.params.get('prefer_insecure', False)
643 else 'https')
7e8c0af0 644
c5e8d7af
PH
645 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
646 mobj = re.search(self._NEXT_URL_RE, url)
647 if mobj:
7e8c0af0 648 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 649 video_id = self.extract_id(url)
c5e8d7af
PH
650
651 # Get video webpage
7e8c0af0 652 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 653 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
654
655 # Attempt to extract SWF player URL
e0df6211 656 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
657 if mobj is not None:
658 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
659 else:
660 player_url = None
661
662 # Get video info
663 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
664 if re.search(r'player-age-gate-content">', video_webpage) is not None:
665 self.report_age_confirmation()
666 age_gate = True
667 # We simulate the access to the video from www.youtube.com/v/{video_id}
668 # this can be viewed without login into Youtube
2c57c7fa
JMF
669 data = compat_urllib_parse.urlencode({
670 'video_id': video_id,
671 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934
JMF
672 'sts': self._search_regex(
673 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
2c57c7fa 674 })
7e8c0af0 675 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
676 video_info_webpage = self._download_webpage(video_info_url, video_id,
677 note=False,
678 errnote='unable to download video info webpage')
679 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
680 else:
681 age_gate = False
682 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 683 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
684 % (video_id, el_type))
685 video_info_webpage = self._download_webpage(video_info_url, video_id,
686 note=False,
687 errnote='unable to download video info webpage')
688 video_info = compat_parse_qs(video_info_webpage)
689 if 'token' in video_info:
690 break
c5e8d7af
PH
691 if 'token' not in video_info:
692 if 'reason' in video_info:
d11271dd 693 raise ExtractorError(
78caa52a 694 'YouTube said: %s' % video_info['reason'][0],
d11271dd 695 expected=True, video_id=video_id)
c5e8d7af 696 else:
d11271dd 697 raise ExtractorError(
78caa52a 698 '"token" parameter not in video info for unknown reason',
d11271dd 699 video_id=video_id)
c5e8d7af 700
1d699755
PH
701 if 'view_count' in video_info:
702 view_count = int(video_info['view_count'][0])
703 else:
704 view_count = None
705
c5e8d7af
PH
706 # Check for "rental" videos
707 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
708 raise ExtractorError(u'"rental" videos not supported')
709
710 # Start extracting information
711 self.report_information_extraction(video_id)
712
713 # uploader
714 if 'author' not in video_info:
715 raise ExtractorError(u'Unable to extract uploader name')
716 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
717
718 # uploader_id
719 video_uploader_id = None
720 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
721 if mobj is not None:
722 video_uploader_id = mobj.group(1)
723 else:
724 self._downloader.report_warning(u'unable to extract uploader nickname')
725
726 # title
a8c6b241 727 if 'title' in video_info:
aa92f063 728 video_title = video_info['title'][0]
a8c6b241
PH
729 else:
730 self._downloader.report_warning(u'Unable to extract video title')
78caa52a 731 video_title = '_'
c5e8d7af
PH
732
733 # thumbnail image
7763b04e
JMF
734 # We try first to get a high quality image:
735 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
736 video_webpage, re.DOTALL)
737 if m_thumb is not None:
738 video_thumbnail = m_thumb.group(1)
739 elif 'thumbnail_url' not in video_info:
c5e8d7af 740 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 741 video_thumbnail = None
c5e8d7af
PH
742 else: # don't panic if we can't find it
743 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
744
745 # upload date
746 upload_date = None
ad3bc6ac 747 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
748 if mobj is None:
749 mobj = re.search(
263bd4ec 750 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 751 video_webpage)
c5e8d7af
PH
752 if mobj is not None:
753 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
754 upload_date = unified_strdate(upload_date)
755
55f7bd2d
PH
756 m_cat_container = self._search_regex(
757 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
758 video_webpage, 'categories', fatal=False)
ec8deefc 759 if m_cat_container:
ad3bc6ac 760 category = self._html_search_regex(
01ed5c9b 761 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
762 default=None)
763 video_categories = None if category is None else [category]
764 else:
765 video_categories = None
ec8deefc 766
c5e8d7af
PH
767 # description
768 video_description = get_element_by_id("eow-description", video_webpage)
769 if video_description:
27dcce19
PH
770 video_description = re.sub(r'''(?x)
771 <a\s+
772 (?:[a-zA-Z-]+="[^"]+"\s+)*?
773 title="([^"]+)"\s+
774 (?:[a-zA-Z-]+="[^"]+"\s+)*?
775 class="yt-uix-redirect-link"\s*>
776 [^<]+
777 </a>
778 ''', r'\1', video_description)
c5e8d7af
PH
779 video_description = clean_html(video_description)
780 else:
781 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
782 if fd_mobj:
783 video_description = unescapeHTML(fd_mobj.group(1))
784 else:
78caa52a 785 video_description = ''
c5e8d7af 786
f30a38be 787 def _extract_count(count_name):
46374a56 788 count = self._search_regex(
f30a38be
JMF
789 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
790 video_webpage, count_name, default=None)
336c3a69
JMF
791 if count is not None:
792 return int(count.replace(',', ''))
793 return None
f30a38be
JMF
794 like_count = _extract_count(u'like')
795 dislike_count = _extract_count(u'dislike')
336c3a69 796
c5e8d7af 797 # subtitles
d82134c3 798 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 799
c5e8d7af 800 if self._downloader.params.get('listsubtitles', False):
d665f8d3 801 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
802 return
803
804 if 'length_seconds' not in video_info:
805 self._downloader.report_warning(u'unable to extract video duration')
b466b702 806 video_duration = None
c5e8d7af 807 else:
b466b702 808 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 809
1fb07d10
JG
810 # annotations
811 video_annotations = None
812 if self._downloader.params.get('writeannotations', False):
813 video_annotations = self._extract_annotations(video_id)
814
c5e8d7af 815 # Decide which formats to download
c5e8d7af 816 try:
ae7ed920 817 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
818 if not mobj:
819 raise ValueError('Could not find vevo ID')
ae7ed920
PH
820 json_code = uppercase_escape(mobj.group(1))
821 ytplayer_config = json.loads(json_code)
3489b7d2 822 args = ytplayer_config['args']
7ce7e394
JMF
823 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
824 # this signatures are encrypted
44d46655 825 if 'url_encoded_fmt_stream_map' not in args:
f10503db 826 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
827 re_signature = re.compile(r'[&,]s=')
828 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
829 if m_s is not None:
830 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 831 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
78caa52a 832 m_s = re_signature.search(args.get('adaptive_fmts', ''))
b7a68384 833 if m_s is not None:
00fe14fc
JMF
834 if 'adaptive_fmts' in video_info:
835 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 836 else:
00fe14fc 837 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
838 except ValueError:
839 pass
840
dd27fd17
PH
841 def _map_to_format_list(urlmap):
842 formats = []
843 for itag, video_real_url in urlmap.items():
844 dct = {
845 'format_id': itag,
846 'url': video_real_url,
847 'player_url': player_url,
848 }
0b65e5d4
PH
849 if itag in self._formats:
850 dct.update(self._formats[itag])
dd27fd17
PH
851 formats.append(dct)
852 return formats
853
c5e8d7af
PH
854 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
855 self.report_rtmp_download()
dd27fd17
PH
856 formats = [{
857 'format_id': '_rtmp',
858 'protocol': 'rtmp',
859 'url': video_info['conn'][0],
860 'player_url': player_url,
861 }]
00fe14fc
JMF
862 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
863 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
864 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 865 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 866 url_map = {}
00fe14fc 867 for url_data_str in encoded_url_map.split(','):
c5e8d7af 868 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
869 if 'itag' not in url_data or 'url' not in url_data:
870 continue
871 format_id = url_data['itag'][0]
872 url = url_data['url'][0]
873
874 if 'sig' in url_data:
875 url += '&signature=' + url_data['sig'][0]
876 elif 's' in url_data:
877 encrypted_sig = url_data['s'][0]
878
879 if not age_gate:
880 jsplayer_url_json = self._search_regex(
881 r'"assets":.+?"js":\s*("[^"]+")',
78caa52a 882 video_webpage, 'JS player URL')
201e9eaa
PH
883 player_url = json.loads(jsplayer_url_json)
884 if player_url is None:
885 player_url_json = self._search_regex(
886 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
78caa52a 887 video_webpage, 'age gate player URL')
201e9eaa
PH
888 player_url = json.loads(player_url_json)
889
890 if self._downloader.params.get('verbose'):
cf010131 891 if player_url is None:
201e9eaa
PH
892 player_version = 'unknown'
893 player_desc = 'unknown'
894 else:
895 if player_url.endswith('swf'):
896 player_version = self._search_regex(
897 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
78caa52a 898 'flash player', fatal=False)
201e9eaa 899 player_desc = 'flash player %s' % player_version
cf010131 900 else:
201e9eaa
PH
901 player_version = self._search_regex(
902 r'html5player-([^/]+?)(?:/html5player)?\.js',
903 player_url,
904 'html5 player', fatal=False)
78caa52a 905 player_desc = 'html5 player %s' % player_version
201e9eaa 906
60064c53 907 parts_sizes = self._signature_cache_id(encrypted_sig)
98eb1c3f
PH
908 self.to_screen(u'{%s} signature length %s, %s' %
909 (format_id, parts_sizes, player_desc))
201e9eaa
PH
910
911 signature = self._decrypt_signature(
912 encrypted_sig, video_id, player_url, age_gate)
913 url += '&signature=' + signature
914 if 'ratebypass' not in url:
915 url += '&ratebypass=yes'
916 url_map[format_id] = url
dd27fd17 917 formats = _map_to_format_list(url_map)
1d043b93
JMF
918 elif video_info.get('hlsvp'):
919 manifest_url = video_info['hlsvp'][0]
920 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 921 formats = _map_to_format_list(url_map)
c5e8d7af 922 else:
9abb3204 923 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 924
dd27fd17 925 # Look for the DASH manifest
d68f0cdb 926 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
dd27fd17 927 try:
d68f0cdb 928 # The DASH manifest used needs to be the one from the original video_webpage.
929 # The one found in get_video_info seems to be using different signatures.
930 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
931 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
932 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
933 if age_gate:
3489b7d2 934 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 935 else:
3489b7d2 936 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 937 def decrypt_sig(mobj):
938 s = mobj.group(1)
939 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
940 return '/signature/%s' % dec_s
941 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 942 dash_doc = self._download_xml(
d68f0cdb 943 dash_manifest_url, video_id,
dd27fd17
PH
944 note=u'Downloading DASH manifest',
945 errnote=u'Could not download DASH manifest')
946 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
947 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
948 if url_el is None:
949 continue
950 format_id = r.attrib['id']
951 video_url = url_el.text
952 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
953 f = {
954 'format_id': format_id,
955 'url': video_url,
956 'width': int_or_none(r.attrib.get('width')),
957 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
958 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
959 'filesize': filesize,
960 }
961 try:
962 existing_format = next(
963 fo for fo in formats
964 if fo['format_id'] == format_id)
965 except StopIteration:
966 f.update(self._formats.get(format_id, {}))
967 formats.append(f)
968 else:
969 existing_format.update(f)
970
971 except (ExtractorError, KeyError) as e:
972 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
d80044c2 973
4bcc7bd1 974 self._sort_formats(formats)
4ea3be0a 975
976 return {
977 'id': video_id,
978 'uploader': video_uploader,
979 'uploader_id': video_uploader_id,
980 'upload_date': upload_date,
981 'title': video_title,
982 'thumbnail': video_thumbnail,
983 'description': video_description,
ec8deefc 984 'categories': video_categories,
4ea3be0a 985 'subtitles': video_subtitles,
986 'duration': video_duration,
987 'age_limit': 18 if age_gate else 0,
988 'annotations': video_annotations,
7e8c0af0 989 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 990 'view_count': view_count,
991 'like_count': like_count,
992 'dislike_count': dislike_count,
993 'formats': formats,
994 }
c5e8d7af 995
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, given as a URL or a bare playlist id.

    Mixes (ids starting with ``RD``) are scraped from a watch page by
    ``_extract_mix``; every other playlist is read from its playlist page
    plus the "load more" continuation pages that page links to.
    """
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    # Group 1: playlist id taken from a youtube.com URL.
    # Group 2: bare playlist id (a prefix is mandatory in that form).
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': 'Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }]

    def _real_initialize(self):
        """Log in before extracting."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(
                self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist from the watch page of its seed video."""
        # A mix is generated from a single video: the playlist id is just
        # 'RD' + video_id, so the last 11 characters are the seed video id.
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (
            playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # Try the known title containers in order; keep the first non-empty one.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_link_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_link_re, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for `url` (or a single video with --no-playlist)."""
        # Extract playlist id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        playlist_url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(playlist_url, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages. The first page acts
        # as both the content and the "load more" widget; later pages come
        # back as JSON with the two parts separated.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if load_more is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1151
1152
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo-URL."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = []

    def _real_extract(self, url):
        """Find the list link on the channel page and collect its video ids."""
        match = re.match(self._VALID_URL, url)
        channel = match.group('chann')
        title = match.group('title')

        # The list link on the channel page carries the url-encoded title.
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, 'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            msg = 'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(list_url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1185
1186
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a YouTube channel URL."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the watch-link video ids found in `page`, deduplicated in order."""
        seen = set()
        ids_in_page = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = link.group(1)
            if video_id in seen:
                continue
            seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect every video id of the channel and return them as a playlist."""
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1241
1242
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, given a user URL or "ytuser:<name>".

    Uploads are listed through the GData JSON feed, fetched lazily one page
    of ``_GDATA_PAGE_SIZE`` entries at a time via ``PagedList``.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other *IE class in this module accepts `url`."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError if `url` does not match or the API response is
        not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url entry per upload on the zero-based page `pagenum`."""
            # The feed's start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index requested by this query (inclusive); the note used
            # to print one past it ("from 1 to 51" for a 50-item page).
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: yield nothing.
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1303
b05654f0
PH
1304
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the "ytsearch" keyword, backed by the GData jsonc API."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50

        collected_ids = []
        # Upper bound on results to fetch; lowered once the API reports how
        # many items exist in total.
        wanted = n
        page_index = 0

        while PAGE_SIZE * page_index < wanted:
            start_index = PAGE_SIZE * page_index + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                video['id'] for video in api_response['items'])

            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may overshoot the requested amount.
        videos = []
        for video_id in collected_ids[:n]:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1346
c9ae7b95 1347
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search as YoutubeSearchIE, with orderby=published added to the API URL."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1353
c9ae7b95
PH
1354
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results search URLs (scrapes the result page)."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist of the items linked from the search result page."""
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result title heading holds both the item title and its link.
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # The title is optional (fatal=False); the link is required.
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entry = {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }
            entries.append(entry)

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1389
1390
class YoutubeShowIE(InfoExtractor):
    """Extractor for youtube.com/show/ URLs; yields one playlist per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url_results, one per season playlist link."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))

        season_results = []
        for season_path in season_paths:
            season_results.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return season_results
04cc9617
JMF
1404
1405
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """feed_ajax URL for this feed, with one '%s' left for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before extracting."""
        self._login()

    def _real_extract(self, url):
        """Walk the feed pages and return every linked video as one playlist."""
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_number)
            # The markup is read from 'feed_html', else 'content_html'; the
            # "load more" widget falls back to that same markup.
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_number += 1

        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1451
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1457
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1464
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: '\.' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. The pattern value is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1471
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and delegate to YoutubePlaylist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url_result for the playlist id found on the favourites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The playlist id is whatever follows the first 'list=' up to a quote or '&'.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1482
1483
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos listed on the subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Walk the subscriptions feed page by page and return a playlist.

        The incoming ``url`` is only used for matching; the fixed
        subscriptions feed URL is always the one downloaded.
        """
        feed_title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', feed_title)

        # Same approach as for playlists, except that the video id regex
        # does not contain an index.
        video_ids = []
        # For the first page both the video links and the "load more"
        # widget are looked up in the full page HTML.
        videos_html = first_page
        pager_html = first_page

        page_num = 1
        while True:
            # De-duplicate within the current page while keeping order.
            video_ids += orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', videos_html))

            next_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', pager_html)
            if next_link is None:
                # No "load more" link: this was the last page.
                break

            next_page = self._download_json(
                'https://youtube.com/%s' % next_link.group('more'), feed_title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            videos_html = next_page['content_html']
            pager_html = next_page['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': feed_title,
            'entries': self._ids_to_results(video_ids),
        }
1520
1521
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that carry no video id and report a helpful error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list

    # Verbose regex: either a /watch? URL whose query is empty or holds only
    # a "feature" or "annotation_id" parameter, or an attribution_link URL
    # whose query consists of a single "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .'
        )
        raise ExtractorError(quoting_hint, expected=True)