]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Merge branch 'peugeot-tnaflix'
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
edf3e38e 3import errno
e0df6211 4import io
0ca96d48 5import itertools
c5e8d7af 6import json
c4417ddb 7import os.path
c5e8d7af 8import re
e0df6211 9import traceback
c5e8d7af 10
b05654f0 11from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 12from .subtitles import SubtitlesInfoExtractor
2b25cb5d 13from ..jsinterp import JSInterpreter
54256267 14from ..swfinterp import SWFInterpreter
c5e8d7af 15from ..utils import (
edf3e38e 16 compat_chr,
c5e8d7af 17 compat_parse_qs,
c5e8d7af
PH
18 compat_urllib_parse,
19 compat_urllib_request,
7c61bd36 20 compat_urlparse,
c5e8d7af
PH
21 compat_str,
22
23 clean_html,
c38b1e77 24 get_cachedir,
c5e8d7af 25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
b7ab0590 29 PagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Covers the session set-up steps run from _real_initialize: forcing the
    interface language, logging in (optionally with a two-factor code) and
    confirming age.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL; return True if the page could be downloaded."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) so the documented contract holds
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,

            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
            tfa_fields = {}
            for field in (u'secTok', u'timeStmp'):
                match = re.search(
                    r'id="%s"\n\s+value=\'(.+)\'/>' % field,
                    login_results, re.M | re.U)
                if match is None:
                    # Without this field the TFA form cannot be submitted;
                    # give up instead of dereferencing a missing match.
                    self._downloader.report_warning(
                        u'Failed to get %s - did the page structure change?' % field)
                    return False
                tfa_fields[field] = match.group(1)

            tfa_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'smsToken': u'',
                u'smsUserPin': tfa_code,
                u'smsVerifyPin': u'Verify',

                u'PersistentCookie': u'yes',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'pstMsg': u'1',
                u'secTok': tfa_fields[u'secTok'],
                u'timeStmp': tfa_fields[u'timeStmp'],
                u'service': u'youtube',
                u'hl': u'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning(u'unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being present in the response means the login was rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set language, log in and confirm age; stop at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 200
8377574c 201
de7f3446 202class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 203 IE_DESC = u'YouTube.com'
cb7dfeea 204 _VALID_URL = r"""(?x)^
c5e8d7af 205 (
83aa5293 206 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 208 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 209 (?:www\.)?pwnyoutube\.com/|
f7000f3a 210 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
215 (?:(?:v|embed|e)/) # v/ or embed/ or e/
216 |(?: # or the v= param in all its forms
f7000f3a 217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
f4b05232
JMF
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
b9c76aa1 224 |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 225 )
c5e8d7af 226 )? # all until now is optional -> you can pass the naked ID
8963d9c2 227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
228 (?(1).+)? # if we found the ID, everything can follow
229 $"""
c5e8d7af 230 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
231 _formats = {
232 '5': {'ext': 'flv', 'width': 400, 'height': 240},
233 '6': {'ext': 'flv', 'width': 450, 'height': 270},
234 '13': {'ext': '3gp'},
235 '17': {'ext': '3gp', 'width': 176, 'height': 144},
236 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
237 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
238 '34': {'ext': 'flv', 'width': 640, 'height': 360},
239 '35': {'ext': 'flv', 'width': 854, 'height': 480},
240 '36': {'ext': '3gp', 'width': 320, 'height': 240},
241 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
242 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
243 '43': {'ext': 'webm', 'width': 640, 'height': 360},
244 '44': {'ext': 'webm', 'width': 854, 'height': 480},
245 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
246 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
247
1d043b93 248
86fe61c8 249 # 3d videos
43b81eb9
PH
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 257
96fb5605 258 # Apple HTTP Live Streaming
43b81eb9
PH
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
266
267 # DASH mp4 video
43b81eb9
PH
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 276
f6f1fc92 277 # Dash mp4 audio
2c62dc26
PH
278 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
279 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
280 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
281
282 # Dash webm
e75cafe9
A
283 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
284 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
290 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
291 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 296 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 297 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
298
299 # Dash webm audio
55db73ef 300 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 301 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
302
303 # RTMP (unnamed)
304 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 305 }
836a086c 306
c5e8d7af 307 IE_NAME = u'youtube'
2eb88d95
PH
308 _TESTS = [
309 {
0e853ca4
PH
310 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
311 u"file": u"BaW_jenozKc.mp4",
312 u"info_dict": {
313 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
314 u"uploader": u"Philipp Hagemeister",
315 u"uploader_id": u"phihag",
316 u"upload_date": u"20121002",
ad3bc6ac
PH
317 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
318 u"categories": [u'Science & Technology'],
3e7c1224
PH
319 'like_count': int,
320 'dislike_count': int,
2eb88d95 321 }
0e853ca4 322 },
0e853ca4
PH
323 {
324 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
325 u"file": u"UxxajLWwzqY.mp4",
326 u"note": u"Test generic use_cipher_signature video (#897)",
327 u"info_dict": {
328 u"upload_date": u"20120506",
329 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
ba60a3eb 330 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
45ed795c 331 u"uploader": u"Icona Pop",
0e853ca4 332 u"uploader_id": u"IconaPop"
2eb88d95 333 }
c108eb73
JMF
334 },
335 {
336 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
337 u"file": u"07FYdnEawAQ.mp4",
338 u"note": u"Test VEVO video with age protection (#956)",
339 u"info_dict": {
340 u"upload_date": u"20130703",
341 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
342 u"description": u"md5:64249768eec3bc4276236606ea996373",
343 u"uploader": u"justintimberlakeVEVO",
344 u"uploader_id": u"justintimberlakeVEVO"
345 }
346 },
fccd3771 347 {
83aa5293 348 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
349 u"file": u"yZIXLfi8CZQ.mp4",
350 u"note": u"Embed-only video (#1746)",
351 u"info_dict": {
352 u"upload_date": u"20120608",
353 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
354 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
355 u"uploader": u"SET India",
356 u"uploader_id": u"setindia"
357 }
358 },
dd27fd17
PH
359 {
360 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
361 u"file": u"a9LDPn-MO4I.m4a",
362 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
363 u"info_dict": {
364 u"upload_date": "20121002",
365 u"uploader_id": "8KVIDEO",
366 u"description": "No description available.",
367 u"uploader": "8KVIDEO",
368 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
369 },
370 u"params": {
371 u"youtube_include_dash_manifest": True,
372 u"format": "141",
373 },
dd27fd17 374 },
3489b7d2
JMF
375 # DASH manifest with encrypted signature
376 {
377 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
378 u'info_dict': {
379 u'id': u'IB3lcPjvWLA',
380 u'ext': u'm4a',
381 u'title': u'Afrojack - The Spark ft. Spree Wilson',
e00c9cf5 382 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
3489b7d2
JMF
383 u'uploader': u'AfrojackVEVO',
384 u'uploader_id': u'AfrojackVEVO',
385 u'upload_date': u'20131011',
386 },
387 u"params": {
388 u'youtube_include_dash_manifest': True,
389 u'format': '141',
390 },
391 },
2eb88d95
PH
392 ]
393
c5e8d7af
PH
394
@classmethod
def suitable(cls, url):
    """Return True if *url* should be handled by this extractor.

    YoutubePlaylistIE is asked first; anything it accepts is rejected here.
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    return not claimed_by_playlist and re.match(cls._VALID_URL, url) is not None
c5e8d7af 400
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initialiser and create the player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Deciphering functions, filled lazily by _decrypt_signature
    self._player_cache = dict()
e0df6211 404
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
408
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
412
def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available for the video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
416
def report_rtmp_download(self):
    """Tell the user that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
420
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of a signature, joined by dots."""
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return u'.'.join(part_lengths)
424
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type ('js' or 'swf') are parsed from player_url.  When
    a cache directory is configured, a stored index list for this player and
    signature shape is used if it can be read; otherwise the player is
    downloaded and parsed, and the resulting index list is written to the
    cache on a best-effort basis.

    Raises ExtractorError if the player URL cannot be parsed, if the derived
    cache id is not a plain file name, or if the player type is unknown.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # The id becomes a file name; refuse anything containing a path component.
    # (An explicit raise, because assert statements are removed under -O.)
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # The cache file exists but is not valid JSON; report its size
            # (or the error hit while reading it) and fall through to a
            # fresh extraction.
            try:
                file_size = os.path.getsize(cache_fn)
            except (OSError, IOError) as oe:
                file_size = str(oe)
            self._downloader.report_warning(
                u'Cache %s failed (%s)' % (cache_fn, file_size))

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        raise ExtractorError('Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose characters are their own
            # positions, so the output spells out the index list.
            test_string = u''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is optional: warn and still return the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
492
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to func for signatures shaped like example_sig.

    func is applied to a string whose characters are their own positions;
    the result is read back as a list of indices, and runs of consecutive
    indices (step +1 or -1) are written as slices of ``s``.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = u'' if start == 0 else str(start)
            # A negative stop would wrap around, so use an open-ended slice
            ends = (u':%d' % (end + step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # With fewer than two indices the pairwise loop below never runs and
        # its loop variable would be unbound afterwards.
        if len(idxs) < 2:
            for idx in idxs:
                yield u's[%d]' % idx
            return

        step = None
        start = '(Never used)'  # Squelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; emit it and start over
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        # Flush whatever is still pending after the last pair
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            u'    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 531
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a deciphering callable from the JavaScript player source.

    The name of the signature function is searched for in jscode and that
    function is extracted with JSInterpreter; the returned callable passes
    its argument to it as a one-element list.
    """
    sig_func_name = self._search_regex(
        r'signature=([$a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
540
def _parse_sig_swf(self, file_contents):
    """Build a deciphering callable from the contents of the SWF player.

    The ``decipher`` function of the ``SignatureDecipher`` class is
    extracted with SWFInterpreter; the returned callable passes its
    argument to it as a one-element list.
    """
    TARGET_CLASSNAME = u'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, u'decipher')

    def decipher(s):
        return swf_function([s])

    return decipher
547
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    Deciphering functions are kept in self._player_cache, keyed by the
    player URL together with the shape of the signature.  Any failure while
    obtaining or running the function is re-raised as an ExtractorError that
    carries the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError(u'Cannot decrypt signature without player_url')

    # Protocol-relative URL: default to https
    if player_url.startswith(u'//'):
        player_url = u'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            u'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 571
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL for the video.

    The track list is downloaded from the timedtext service; only the first
    track seen for each language code is kept.  An empty dict is returned
    (after a warning) if the list cannot be downloaded or has no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_list)

    urls_by_lang = {}
    for track_name, lang in tracks:
        if lang not in urls_by_lang:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': unescapeHTML(track_name).encode('utf-8'),
            })
            urls_by_lang[lang] = u'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
599
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping language code to automatic caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    An empty dict is returned (after a warning) when the player
    configuration cannot be found or parsed, when a required key is
    missing, when the caption list cannot be downloaded, or when the first
    track is not an automatic ('asr') one.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The non-greedy match may stop before the end of the object, or the
        # page may embed something that is not valid JSON.
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
646
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of _VALID_URL matched against *url*.

    Raises ExtractorError if the URL does not match.
    """
    mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
    if mobj is not None:
        return mobj.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
654
1d043b93
JMF
655 def _extract_from_m3u8(self, manifest_url, video_id):
656 url_map = {}
657 def _get_urls(_manifest):
658 lines = _manifest.split('\n')
659 urls = filter(lambda l: l and not l.startswith('#'),
660 lines)
661 return urls
662 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
663 formats_urls = _get_urls(manifest)
664 for format_url in formats_urls:
890f62e8 665 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
666 url_map[itag] = format_url
667 return url_map
668
1fb07d10
JG
669 def _extract_annotations(self, video_id):
670 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
671 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
672
c5e8d7af 673 def _real_extract(self, url):
7e8c0af0
PH
674 proto = (
675 u'http' if self._downloader.params.get('prefer_insecure', False)
676 else u'https')
677
c5e8d7af
PH
678 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
679 mobj = re.search(self._NEXT_URL_RE, url)
680 if mobj:
7e8c0af0 681 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 682 video_id = self.extract_id(url)
c5e8d7af
PH
683
684 # Get video webpage
7e8c0af0 685 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 686 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
687
688 # Attempt to extract SWF player URL
e0df6211 689 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
690 if mobj is not None:
691 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
692 else:
693 player_url = None
694
695 # Get video info
696 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
697 if re.search(r'player-age-gate-content">', video_webpage) is not None:
698 self.report_age_confirmation()
699 age_gate = True
700 # We simulate the access to the video from www.youtube.com/v/{video_id}
701 # this can be viewed without login into Youtube
2c57c7fa
JMF
702 data = compat_urllib_parse.urlencode({
703 'video_id': video_id,
704 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934
JMF
705 'sts': self._search_regex(
706 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
2c57c7fa 707 })
7e8c0af0 708 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
709 video_info_webpage = self._download_webpage(video_info_url, video_id,
710 note=False,
711 errnote='unable to download video info webpage')
712 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
713 else:
714 age_gate = False
715 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 716 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
717 % (video_id, el_type))
718 video_info_webpage = self._download_webpage(video_info_url, video_id,
719 note=False,
720 errnote='unable to download video info webpage')
721 video_info = compat_parse_qs(video_info_webpage)
722 if 'token' in video_info:
723 break
c5e8d7af
PH
724 if 'token' not in video_info:
725 if 'reason' in video_info:
d11271dd
PH
726 raise ExtractorError(
727 u'YouTube said: %s' % video_info['reason'][0],
728 expected=True, video_id=video_id)
c5e8d7af 729 else:
d11271dd
PH
730 raise ExtractorError(
731 u'"token" parameter not in video info for unknown reason',
732 video_id=video_id)
c5e8d7af 733
1d699755
PH
734 if 'view_count' in video_info:
735 view_count = int(video_info['view_count'][0])
736 else:
737 view_count = None
738
c5e8d7af
PH
739 # Check for "rental" videos
740 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
741 raise ExtractorError(u'"rental" videos not supported')
742
743 # Start extracting information
744 self.report_information_extraction(video_id)
745
746 # uploader
747 if 'author' not in video_info:
748 raise ExtractorError(u'Unable to extract uploader name')
749 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
750
751 # uploader_id
752 video_uploader_id = None
753 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
754 if mobj is not None:
755 video_uploader_id = mobj.group(1)
756 else:
757 self._downloader.report_warning(u'unable to extract uploader nickname')
758
759 # title
a8c6b241 760 if 'title' in video_info:
aa92f063 761 video_title = video_info['title'][0]
a8c6b241
PH
762 else:
763 self._downloader.report_warning(u'Unable to extract video title')
764 video_title = u'_'
c5e8d7af
PH
765
766 # thumbnail image
7763b04e
JMF
767 # We try first to get a high quality image:
768 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
769 video_webpage, re.DOTALL)
770 if m_thumb is not None:
771 video_thumbnail = m_thumb.group(1)
772 elif 'thumbnail_url' not in video_info:
c5e8d7af 773 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 774 video_thumbnail = None
c5e8d7af
PH
775 else: # don't panic if we can't find it
776 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
777
778 # upload date
779 upload_date = None
ad3bc6ac 780 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
781 if mobj is None:
782 mobj = re.search(
263bd4ec 783 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 784 video_webpage)
c5e8d7af
PH
785 if mobj is not None:
786 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
787 upload_date = unified_strdate(upload_date)
788
55f7bd2d
PH
789 m_cat_container = self._search_regex(
790 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
791 video_webpage, 'categories', fatal=False)
ec8deefc 792 if m_cat_container:
ad3bc6ac 793 category = self._html_search_regex(
01ed5c9b 794 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
795 default=None)
796 video_categories = None if category is None else [category]
797 else:
798 video_categories = None
ec8deefc 799
c5e8d7af
PH
800 # description
801 video_description = get_element_by_id("eow-description", video_webpage)
802 if video_description:
27dcce19
PH
803 video_description = re.sub(r'''(?x)
804 <a\s+
805 (?:[a-zA-Z-]+="[^"]+"\s+)*?
806 title="([^"]+)"\s+
807 (?:[a-zA-Z-]+="[^"]+"\s+)*?
808 class="yt-uix-redirect-link"\s*>
809 [^<]+
810 </a>
811 ''', r'\1', video_description)
c5e8d7af
PH
812 video_description = clean_html(video_description)
813 else:
814 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
815 if fd_mobj:
816 video_description = unescapeHTML(fd_mobj.group(1))
817 else:
818 video_description = u''
819
f30a38be 820 def _extract_count(count_name):
46374a56 821 count = self._search_regex(
f30a38be
JMF
822 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
823 video_webpage, count_name, default=None)
336c3a69
JMF
824 if count is not None:
825 return int(count.replace(',', ''))
826 return None
f30a38be
JMF
827 like_count = _extract_count(u'like')
828 dislike_count = _extract_count(u'dislike')
336c3a69 829
c5e8d7af 830 # subtitles
d82134c3 831 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 832
c5e8d7af 833 if self._downloader.params.get('listsubtitles', False):
d665f8d3 834 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
835 return
836
837 if 'length_seconds' not in video_info:
838 self._downloader.report_warning(u'unable to extract video duration')
b466b702 839 video_duration = None
c5e8d7af 840 else:
b466b702 841 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 842
1fb07d10
JG
843 # annotations
844 video_annotations = None
845 if self._downloader.params.get('writeannotations', False):
846 video_annotations = self._extract_annotations(video_id)
847
c5e8d7af 848 # Decide which formats to download
c5e8d7af 849 try:
ae7ed920 850 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
851 if not mobj:
852 raise ValueError('Could not find vevo ID')
ae7ed920
PH
853 json_code = uppercase_escape(mobj.group(1))
854 ytplayer_config = json.loads(json_code)
3489b7d2 855 args = ytplayer_config['args']
7ce7e394
JMF
856 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
857 # this signatures are encrypted
44d46655 858 if 'url_encoded_fmt_stream_map' not in args:
f10503db 859 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
860 re_signature = re.compile(r'[&,]s=')
861 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
862 if m_s is not None:
863 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 864 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 865 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 866 if m_s is not None:
00fe14fc
JMF
867 if 'adaptive_fmts' in video_info:
868 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 869 else:
00fe14fc 870 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
871 except ValueError:
872 pass
873
dd27fd17
PH
874 def _map_to_format_list(urlmap):
875 formats = []
876 for itag, video_real_url in urlmap.items():
877 dct = {
878 'format_id': itag,
879 'url': video_real_url,
880 'player_url': player_url,
881 }
0b65e5d4
PH
882 if itag in self._formats:
883 dct.update(self._formats[itag])
dd27fd17
PH
884 formats.append(dct)
885 return formats
886
c5e8d7af
PH
887 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
888 self.report_rtmp_download()
dd27fd17
PH
889 formats = [{
890 'format_id': '_rtmp',
891 'protocol': 'rtmp',
892 'url': video_info['conn'][0],
893 'player_url': player_url,
894 }]
00fe14fc
JMF
895 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
896 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
897 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 898 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 899 url_map = {}
00fe14fc 900 for url_data_str in encoded_url_map.split(','):
c5e8d7af 901 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
902 if 'itag' not in url_data or 'url' not in url_data:
903 continue
904 format_id = url_data['itag'][0]
905 url = url_data['url'][0]
906
907 if 'sig' in url_data:
908 url += '&signature=' + url_data['sig'][0]
909 elif 's' in url_data:
910 encrypted_sig = url_data['s'][0]
911
912 if not age_gate:
913 jsplayer_url_json = self._search_regex(
914 r'"assets":.+?"js":\s*("[^"]+")',
915 video_webpage, u'JS player URL')
916 player_url = json.loads(jsplayer_url_json)
917 if player_url is None:
918 player_url_json = self._search_regex(
919 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
920 video_webpage, u'age gate player URL')
921 player_url = json.loads(player_url_json)
922
923 if self._downloader.params.get('verbose'):
cf010131 924 if player_url is None:
201e9eaa
PH
925 player_version = 'unknown'
926 player_desc = 'unknown'
927 else:
928 if player_url.endswith('swf'):
929 player_version = self._search_regex(
930 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
931 u'flash player', fatal=False)
932 player_desc = 'flash player %s' % player_version
cf010131 933 else:
201e9eaa
PH
934 player_version = self._search_regex(
935 r'html5player-([^/]+?)(?:/html5player)?\.js',
936 player_url,
937 'html5 player', fatal=False)
938 player_desc = u'html5 player %s' % player_version
939
60064c53 940 parts_sizes = self._signature_cache_id(encrypted_sig)
98eb1c3f
PH
941 self.to_screen(u'{%s} signature length %s, %s' %
942 (format_id, parts_sizes, player_desc))
201e9eaa
PH
943
944 signature = self._decrypt_signature(
945 encrypted_sig, video_id, player_url, age_gate)
946 url += '&signature=' + signature
947 if 'ratebypass' not in url:
948 url += '&ratebypass=yes'
949 url_map[format_id] = url
dd27fd17 950 formats = _map_to_format_list(url_map)
1d043b93
JMF
951 elif video_info.get('hlsvp'):
952 manifest_url = video_info['hlsvp'][0]
953 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 954 formats = _map_to_format_list(url_map)
c5e8d7af 955 else:
9abb3204 956 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 957
dd27fd17 958 # Look for the DASH manifest
d68f0cdb 959 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
dd27fd17 960 try:
d68f0cdb 961 # The DASH manifest used needs to be the one from the original video_webpage.
962 # The one found in get_video_info seems to be using different signatures.
963 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
964 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
965 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
966 if age_gate:
3489b7d2 967 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 968 else:
3489b7d2 969 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 970 def decrypt_sig(mobj):
971 s = mobj.group(1)
972 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
973 return '/signature/%s' % dec_s
974 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 975 dash_doc = self._download_xml(
d68f0cdb 976 dash_manifest_url, video_id,
dd27fd17
PH
977 note=u'Downloading DASH manifest',
978 errnote=u'Could not download DASH manifest')
979 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
980 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
981 if url_el is None:
982 continue
983 format_id = r.attrib['id']
984 video_url = url_el.text
985 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
986 f = {
987 'format_id': format_id,
988 'url': video_url,
989 'width': int_or_none(r.attrib.get('width')),
990 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
991 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
992 'filesize': filesize,
993 }
994 try:
995 existing_format = next(
996 fo for fo in formats
997 if fo['format_id'] == format_id)
998 except StopIteration:
999 f.update(self._formats.get(format_id, {}))
1000 formats.append(f)
1001 else:
1002 existing_format.update(f)
1003
1004 except (ExtractorError, KeyError) as e:
1005 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
d80044c2 1006
4bcc7bd1 1007 self._sort_formats(formats)
4ea3be0a 1008
1009 return {
1010 'id': video_id,
1011 'uploader': video_uploader,
1012 'uploader_id': video_uploader_id,
1013 'upload_date': upload_date,
1014 'title': video_title,
1015 'thumbnail': video_thumbnail,
1016 'description': video_description,
ec8deefc 1017 'categories': video_categories,
4ea3be0a 1018 'subtitles': video_subtitles,
1019 'duration': video_duration,
1020 'age_limit': 18 if age_gate else 0,
1021 'annotations': video_annotations,
7e8c0af0 1022 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 1023 'view_count': view_count,
1024 'like_count': like_count,
1025 'dislike_count': dislike_count,
1026 'formats': formats,
1027 }
c5e8d7af 1028
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including the "RD" mixes.

    Regular playlists are read page by page through the "load more"
    links; mixes are read from the watch page of their seed video.
    """

    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    # Either a youtube.com URL carrying the playlist id (group 1) or a bare
    # prefixed playlist id (group 2).
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Return one 'Youtube' url_result per video id, in the given order."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist from the watch page of its seed video."""
        # A mix is generated from a single video: the last 11 characters of
        # the playlist id are used as that video's id.
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, u'Downloading Youtube mix')

        # Try the known title containers in order and keep the first
        # non-empty one (or the result of the last lookup if all are empty).
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        id_pattern = r'''(?xs)data-video-username=".*?".*?
                   href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(id_pattern, webpage))

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video with --no-playlist)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # A watch URL may point at one video inside the playlist
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes need their own extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Bail out early when the playlist is missing or private
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Walk the playlist pages, collecting the video ids of each one
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            # Duplicates are dropped, and so is the link with index 0
            # (it's not the first video of the playlist)
            page_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0')
            ids.extend(page_ids)

            more_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1144
1145
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for YouTube top lists given as "yttoplist:{channel}:{title}".

    The channel page is searched for a link whose query contains the list
    title; the linked page is then scanned for video ids.
    """

    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on downloads of the list page. The page sometimes comes
    # back without videos, so it is retried, but never without limit.
    _MAX_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return a playlist result with the videos of the requested top list.

        Raises ExtractorError if no video id can be found after
        _MAX_ATTEMPTS downloads of the list page.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos:
        # retry a bounded number of times instead of looping forever
        for i in range(self._MAX_ATTEMPTS):
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                u'Unable to find any video in the top list after %d attempts'
                % self._MAX_ATTEMPTS)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1177
1178
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""

    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from *page*, in page order.

        A set tracks the ids already seen so that de-duplication stays
        linear in the number of links instead of rescanning the result list.
        """
        seen = set()
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel.

        Raises ExtractorError if *url* does not match _VALID_URL.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_json(
                    url, channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the widget no longer offers a "load more" link
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1233
1234
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user, read from the GData API."""

    IE_NAME = u'youtube:user'
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if no other extractor of this module accepts it.

        The pattern of this extractor is very permissive, so every other
        module-level name ending in 'IE' gets the first say.
        """
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result of the user's uploads.

        Raises ExtractorError if *url* does not match _VALID_URL or if an
        API page is not valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # The API caps the number of results per query (_GDATA_PAGE_SIZE),
        # so the uploads are fetched page by page; a page without entries
        # yields nothing.
        page_size = self._GDATA_PAGE_SIZE

        def download_page(pagenum):
            start_index = pagenum * page_size + 1
            note = u'Downloading video ids from %d to %d' % (
                start_index, start_index + page_size)
            page = self._download_webpage(
                self._GDATA_URL % (username, page_size, start_index),
                username, note)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # One url entry per feed item; the video id is the last path
            # component of the entry id.
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        return self.playlist_result(
            PagedList(download_page, page_size), playlist_title=username)
1295
b05654f0
PH
1296
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for "ytsearch" queries, answered by the GData videos API."""

    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist result with at most *n* videos matching *query*.

        Raises ExtractorError if an API page carries no 'items'.
        """
        PAGE_SIZE = 50
        video_ids = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            start_index = PAGE_SIZE * pagenum + 1
            data_json = self._download_webpage(
                self._API_URL % (quoted_query, start_index),
                video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more pages than the API says exist
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested amount
        video_ids = video_ids[:n]
        videos = []
        for video_id in video_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1338
c9ae7b95 1339
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""

    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1345
c9ae7b95
PH
1346
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results search URLs (scrapes the result page)."""

    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist with one url entry per result on the page."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # Every result carries its link and title inside one <h3> heading
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            part_url = compat_urlparse.urljoin(
                'https://www.youtube.com/', part_url_snippet)
            entry = {
                '_type': 'url',
                'url': part_url,
                'title': part_title,
            }
            entries.append(entry)

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1381
1382
class YoutubeShowIE(InfoExtractor):
    """Extractor for youtube.com/show/ pages; each season is a playlist."""

    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list with one YoutubePlaylist url_result per season."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist link for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))

        results = []
        for season_path in season_paths:
            results.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1396
1397
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base of the extractors reading http://www.youtube.com/feed_ajax.

    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL of the feed with a single %s placeholder left for the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with the videos of every feed page."""
        feed_entries = []
        paging = 0
        page_num = 0
        while True:
            page_num += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')

            found = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            for video_id in orderedSet(m.group(1) for m in found):
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The "load more" link carries the paging value of the next page
            next_mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_mobj is None:
                break
            paging = next_mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1442
d7ae0639
JMF
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1448
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'watch_later' feed."""

    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1455
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'history' feed."""

    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the patterns of the sibling extractors: "\." is a
    # regex escape and must not be read as an (invalid) string escape.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1462
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and hand it to YoutubePlaylist."""

    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url_result pointing at the favourites playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=" value found in the page is taken as the playlist id
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1473
1474
1ed5b5c9
JMF
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor for the subscriptions feed, paged like a playlist."""

    IE_NAME = u'youtube:subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

    def _real_extract(self, url):
        """Return a playlist dict with every video found in the feed pages."""
        title = u'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same paging scheme as for playlists, except that the video links
        # carry no index here.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            ids.extend(orderedSet(
                re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        entries = self._ids_to_results(ids)
        return {
            '_type': 'playlist',
            'title': title,
            'entries': entries,
        }
1510
1511
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch URLs without a video id and explain the likely cause.

    Such URLs match when the part after an unquoted "&" is missing; the
    extractor never extracts anything, it only raises a helpful error.
    """

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with quoting advice."""
        advice = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(advice, expected=True)