]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Merge branch 'peugeot-sunporno'
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
edf3e38e 3import errno
e0df6211 4import io
0ca96d48 5import itertools
c5e8d7af 6import json
c4417ddb 7import os.path
c5e8d7af 8import re
e0df6211 9import traceback
c5e8d7af 10
b05654f0 11from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 12from .subtitles import SubtitlesInfoExtractor
2b25cb5d 13from ..jsinterp import JSInterpreter
54256267 14from ..swfinterp import SWFInterpreter
c5e8d7af 15from ..utils import (
edf3e38e 16 compat_chr,
c5e8d7af 17 compat_parse_qs,
c5e8d7af
PH
18 compat_urllib_parse,
19 compat_urllib_request,
7c61bd36 20 compat_urlparse,
c5e8d7af
PH
21 compat_str,
22
23 clean_html,
c38b1e77 24 get_cachedir,
c5e8d7af 25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
b7ab0590 29 PagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL and return the truthiness of the downloaded page."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) to honour the documented contract
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,

            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; give up
                # instead of dereferencing the missing match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'smsToken': u'',
                u'smsUserPin': tfa_code,
                u'smsVerifyPin': u'Verify',

                u'PersistentCookie': u'yes',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'pstMsg': u'1',
                u'secTok': secTok,
                u'timeStmp': timeStmp,
                u'service': u'youtube',
                u'hl': u'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Getting the second-factor form back means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning(u'unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were refused
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 200
8377574c 201
de7f3446 202class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 203 IE_DESC = u'YouTube.com'
cb7dfeea 204 _VALID_URL = r"""(?x)^
c5e8d7af 205 (
83aa5293 206 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 207 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 208 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 209 (?:www\.)?pwnyoutube\.com/|
f7000f3a 210 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
211 tube\.majestyc\.net/|
212 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
213 (?:.*?\#/)? # handle anchor (#/) redirect urls
214 (?: # the various things that can precede the ID:
215 (?:(?:v|embed|e)/) # v/ or embed/ or e/
216 |(?: # or the v= param in all its forms
f7000f3a 217 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
218 (?:\?|\#!?) # the params delimiter ? or # or #!
219 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 v=
221 )
f4b05232
JMF
222 ))
223 |youtu\.be/ # just youtu.be/xxxx
b9c76aa1 224 |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 225 )
c5e8d7af 226 )? # all until now is optional -> you can pass the naked ID
8963d9c2 227 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
228 (?(1).+)? # if we found the ID, everything can follow
229 $"""
c5e8d7af 230 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
231 _formats = {
232 '5': {'ext': 'flv', 'width': 400, 'height': 240},
233 '6': {'ext': 'flv', 'width': 450, 'height': 270},
234 '13': {'ext': '3gp'},
235 '17': {'ext': '3gp', 'width': 176, 'height': 144},
236 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
237 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
238 '34': {'ext': 'flv', 'width': 640, 'height': 360},
239 '35': {'ext': 'flv', 'width': 854, 'height': 480},
240 '36': {'ext': '3gp', 'width': 320, 'height': 240},
241 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
242 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
243 '43': {'ext': 'webm', 'width': 640, 'height': 360},
244 '44': {'ext': 'webm', 'width': 854, 'height': 480},
245 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
246 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
247
1d043b93 248
86fe61c8 249 # 3d videos
43b81eb9
PH
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 257
96fb5605 258 # Apple HTTP Live Streaming
43b81eb9
PH
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
266
267 # DASH mp4 video
43b81eb9
PH
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 276
f6f1fc92 277 # Dash mp4 audio
2c62dc26
PH
278 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
279 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
280 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
281
282 # Dash webm
e75cafe9
A
283 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
284 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
290 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
291 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 296 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 297 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
298
299 # Dash webm audio
55db73ef 300 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 301 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
302
303 # RTMP (unnamed)
304 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 305 }
836a086c 306
c5e8d7af 307 IE_NAME = u'youtube'
2eb88d95
PH
308 _TESTS = [
309 {
0e853ca4
PH
310 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
311 u"file": u"BaW_jenozKc.mp4",
312 u"info_dict": {
313 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
314 u"uploader": u"Philipp Hagemeister",
315 u"uploader_id": u"phihag",
316 u"upload_date": u"20121002",
ad3bc6ac
PH
317 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
318 u"categories": [u'Science & Technology'],
2eb88d95 319 }
0e853ca4 320 },
0e853ca4
PH
321 {
322 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
323 u"file": u"UxxajLWwzqY.mp4",
324 u"note": u"Test generic use_cipher_signature video (#897)",
325 u"info_dict": {
326 u"upload_date": u"20120506",
327 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
ba60a3eb 328 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
45ed795c 329 u"uploader": u"Icona Pop",
0e853ca4 330 u"uploader_id": u"IconaPop"
2eb88d95 331 }
c108eb73
JMF
332 },
333 {
334 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
335 u"file": u"07FYdnEawAQ.mp4",
336 u"note": u"Test VEVO video with age protection (#956)",
337 u"info_dict": {
338 u"upload_date": u"20130703",
339 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
340 u"description": u"md5:64249768eec3bc4276236606ea996373",
341 u"uploader": u"justintimberlakeVEVO",
342 u"uploader_id": u"justintimberlakeVEVO"
343 }
344 },
fccd3771 345 {
83aa5293 346 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
347 u"file": u"yZIXLfi8CZQ.mp4",
348 u"note": u"Embed-only video (#1746)",
349 u"info_dict": {
350 u"upload_date": u"20120608",
351 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
352 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
353 u"uploader": u"SET India",
354 u"uploader_id": u"setindia"
355 }
356 },
dd27fd17
PH
357 {
358 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
359 u"file": u"a9LDPn-MO4I.m4a",
360 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
361 u"info_dict": {
362 u"upload_date": "20121002",
363 u"uploader_id": "8KVIDEO",
364 u"description": "No description available.",
365 u"uploader": "8KVIDEO",
366 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
367 },
368 u"params": {
369 u"youtube_include_dash_manifest": True,
370 u"format": "141",
371 },
dd27fd17 372 },
3489b7d2
JMF
373 # DASH manifest with encrypted signature
374 {
375 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
376 u'info_dict': {
377 u'id': u'IB3lcPjvWLA',
378 u'ext': u'm4a',
379 u'title': u'Afrojack - The Spark ft. Spree Wilson',
e00c9cf5 380 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
3489b7d2
JMF
381 u'uploader': u'AfrojackVEVO',
382 u'uploader_id': u'AfrojackVEVO',
383 u'upload_date': u'20131011',
384 },
385 u"params": {
386 u'youtube_include_dash_manifest': True,
387 u'format': '141',
388 },
389 },
2eb88d95
PH
390 ]
391
c5e8d7af
PH
392
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs claimed by YoutubePlaylistIE are rejected before _VALID_URL is
    tried.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
c5e8d7af 398
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature
    self._player_cache = dict()
e0df6211 402
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is about to be downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
406
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that video information is about to be extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
410
def report_unavailable_format(self, video_id, format):
    """Announce that the given format is not available for the video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
414
def report_rtmp_download(self):
    """Announce that the download will go over RTMP."""
    message = u'RTMP download detected'
    self.to_screen(message)
418
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of a signature, joined by dots."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return u'.'.join(part_lengths)
422
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type ('js' or 'swf') are taken from player_url.
    When a cache directory is configured, a previously stored index
    permutation is loaded from it if present; otherwise the player is
    downloaded and parsed, and the resulting permutation is written back
    to the cache on a best-effort basis.

    Raises ExtractorError if the player cannot be identified from its URL
    or has an unsupported type.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id is used as a file name below, so it must not contain a
    # directory component. Raised explicitly (rather than asserted) so the
    # check survives `python -O`.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # The cache file exists but is not valid JSON: warn and fall
            # through to a fresh extraction.
            try:
                file_size = os.path.getsize(cache_fn)
            except (OSError, IOError) as oe:
                file_size = str(oe)
            self._downloader.report_warning(
                u'Cache %s failed (%s)' % (cache_fn, file_size))

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Explicit raise: an assert would be stripped under -O and leave
        # `res` unbound.
        raise ExtractorError('Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Feed a string whose i-th character has code point i through
            # the function; the code points of the output are then the
            # source indices, which is what gets cached.
            test_string = u''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best-effort; report the failure and carry on
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
490
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    *func* is applied to a probe string whose i-th character has code
    point i, so the code points of the result are the source indices the
    function picks. Runs of consecutive indices (step +1 or -1) are
    printed as slices of ``s``, everything else as single subscripts.
    """
    def _slice_expr(first, last, stride):
        lower = u'' if first == 0 else str(first)
        upper = u':'
        if last + stride >= 0:
            upper = u':%d' % (last + stride)
        suffix = u'' if stride == 1 else (u':%d' % stride)
        return u's[%s%s%s]' % (lower, upper, suffix)

    def _expressions(indices):
        stride = None
        run_start = '(Never used)'  # Quelch pyflakes warnings - set
                                    # together with stride
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is None:
                if delta in (-1, 1):
                    # A run starts at the previous index
                    stride = delta
                    run_start = before
                else:
                    yield u's[%d]' % before
            elif delta != stride:
                # The current run ends at the previous index
                yield _slice_expr(run_start, before, stride)
                stride = None
        if stride is None:
            yield u's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = u''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    body_expr = u' + '.join(_expressions(permutation))
    lengths = ', '.join(
        compat_str(len(piece)) for piece in example_sig.split('.'))
    layout = '(%s)' % lengths
    code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            u' return %s\n') % (layout, body_expr)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 529
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in a JS player.

    The function name is taken from the first ``signature=<name>`` match
    in *jscode* and the function is extracted with JSInterpreter.
    """
    sig_func_name = self._search_regex(
        r'signature=([$a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return js_function([s])
    return decipher
538
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping ``SignatureDecipher.decipher`` from a SWF player."""
    TARGET_CLASSNAME = u'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, u'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return swf_function([s])
    return decipher
545
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    The deciphering function is looked up in (or added to) _player_cache,
    keyed on the player URL and the signature's part lengths. Any failure
    during extraction or deciphering is re-raised as ExtractorError with
    the traceback in the message.
    """
    if player_url is None:
        raise ExtractorError(u'Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are fetched over https
    if player_url.startswith(u'//'):
        player_url = u'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as err:
        trace = traceback.format_exc()
        raise ExtractorError(
            u'Signature extraction failed: ' + trace, cause=err)
e0df6211 569
1f343eaa 570 def _get_available_subtitles(self, video_id, webpage):
de7f3446 571 try:
7fad1c63 572 sub_list = self._download_webpage(
38c2e5b8 573 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
574 video_id, note=False)
575 except ExtractorError as err:
de7f3446
JMF
576 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
577 return {}
578 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
579
580 sub_lang_list = {}
581 for l in lang_list:
582 lang = l[1]
7e660ac1
LD
583 if lang in sub_lang_list:
584 continue
de7f3446
JMF
585 params = compat_urllib_parse.urlencode({
586 'lang': lang,
587 'v': video_id,
ca715127 588 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
c3197e3e 589 'name': unescapeHTML(l[0]).encode('utf-8'),
de7f3446 590 })
38c2e5b8 591 url = u'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
592 sub_lang_list[lang] = url
593 if not sub_lang_list:
594 self._downloader.report_warning(u'video doesn\'t have subtitles')
595 return {}
596 return sub_lang_list
597
055e6f36 598 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
599 """We need the webpage for getting the captions url, pass it as an
600 argument to speed up the process."""
ca715127 601 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
602 self.to_screen(u'%s: Looking for automatic captions' % video_id)
603 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 604 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
605 if mobj is None:
606 self._downloader.report_warning(err_msg)
607 return {}
608 player_config = json.loads(mobj.group(1))
609 try:
610 args = player_config[u'args']
611 caption_url = args[u'ttsurl']
612 timestamp = args[u'timestamp']
055e6f36
JMF
613 # We get the available subtitles
614 list_params = compat_urllib_parse.urlencode({
615 'type': 'list',
616 'tlangs': 1,
617 'asrs': 1,
de7f3446 618 })
055e6f36 619 list_url = caption_url + '&' + list_params
e26f8712 620 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 621 original_lang_node = caption_list.find('track')
f6a54188 622 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
623 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
624 return {}
625 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
626
627 sub_lang_list = {}
628 for lang_node in caption_list.findall('target'):
629 sub_lang = lang_node.attrib['lang_code']
630 params = compat_urllib_parse.urlencode({
631 'lang': original_lang,
632 'tlang': sub_lang,
633 'fmt': sub_format,
634 'ts': timestamp,
635 'kind': 'asr',
636 })
637 sub_lang_list[sub_lang] = caption_url + '&' + params
638 return sub_lang_list
de7f3446
JMF
639 # An extractor error can be raise by the download process if there are
640 # no automatic captions but there are subtitles
641 except (KeyError, ExtractorError):
642 self._downloader.report_warning(err_msg)
643 return {}
644
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in *url*.

    Raises ExtractorError when *url* does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
652
1d043b93
JMF
653 def _extract_from_m3u8(self, manifest_url, video_id):
654 url_map = {}
655 def _get_urls(_manifest):
656 lines = _manifest.split('\n')
657 urls = filter(lambda l: l and not l.startswith('#'),
658 lines)
659 return urls
660 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
661 formats_urls = _get_urls(manifest)
662 for format_url in formats_urls:
890f62e8 663 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
664 url_map[itag] = format_url
665 return url_map
666
1fb07d10
JG
667 def _extract_annotations(self, video_id):
668 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
669 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
670
c5e8d7af 671 def _real_extract(self, url):
7e8c0af0
PH
672 proto = (
673 u'http' if self._downloader.params.get('prefer_insecure', False)
674 else u'https')
675
c5e8d7af
PH
676 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
677 mobj = re.search(self._NEXT_URL_RE, url)
678 if mobj:
7e8c0af0 679 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 680 video_id = self.extract_id(url)
c5e8d7af
PH
681
682 # Get video webpage
7e8c0af0 683 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 684 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
685
686 # Attempt to extract SWF player URL
e0df6211 687 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
688 if mobj is not None:
689 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
690 else:
691 player_url = None
692
693 # Get video info
694 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
695 if re.search(r'player-age-gate-content">', video_webpage) is not None:
696 self.report_age_confirmation()
697 age_gate = True
698 # We simulate the access to the video from www.youtube.com/v/{video_id}
699 # this can be viewed without login into Youtube
2c57c7fa
JMF
700 data = compat_urllib_parse.urlencode({
701 'video_id': video_id,
702 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934
JMF
703 'sts': self._search_regex(
704 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
2c57c7fa 705 })
7e8c0af0 706 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
707 video_info_webpage = self._download_webpage(video_info_url, video_id,
708 note=False,
709 errnote='unable to download video info webpage')
710 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
711 else:
712 age_gate = False
713 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 714 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
715 % (video_id, el_type))
716 video_info_webpage = self._download_webpage(video_info_url, video_id,
717 note=False,
718 errnote='unable to download video info webpage')
719 video_info = compat_parse_qs(video_info_webpage)
720 if 'token' in video_info:
721 break
c5e8d7af
PH
722 if 'token' not in video_info:
723 if 'reason' in video_info:
d11271dd
PH
724 raise ExtractorError(
725 u'YouTube said: %s' % video_info['reason'][0],
726 expected=True, video_id=video_id)
c5e8d7af 727 else:
d11271dd
PH
728 raise ExtractorError(
729 u'"token" parameter not in video info for unknown reason',
730 video_id=video_id)
c5e8d7af 731
1d699755
PH
732 if 'view_count' in video_info:
733 view_count = int(video_info['view_count'][0])
734 else:
735 view_count = None
736
c5e8d7af
PH
737 # Check for "rental" videos
738 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
739 raise ExtractorError(u'"rental" videos not supported')
740
741 # Start extracting information
742 self.report_information_extraction(video_id)
743
744 # uploader
745 if 'author' not in video_info:
746 raise ExtractorError(u'Unable to extract uploader name')
747 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
748
749 # uploader_id
750 video_uploader_id = None
751 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
752 if mobj is not None:
753 video_uploader_id = mobj.group(1)
754 else:
755 self._downloader.report_warning(u'unable to extract uploader nickname')
756
757 # title
a8c6b241 758 if 'title' in video_info:
aa92f063 759 video_title = video_info['title'][0]
a8c6b241
PH
760 else:
761 self._downloader.report_warning(u'Unable to extract video title')
762 video_title = u'_'
c5e8d7af
PH
763
764 # thumbnail image
7763b04e
JMF
765 # We try first to get a high quality image:
766 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
767 video_webpage, re.DOTALL)
768 if m_thumb is not None:
769 video_thumbnail = m_thumb.group(1)
770 elif 'thumbnail_url' not in video_info:
c5e8d7af 771 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 772 video_thumbnail = None
c5e8d7af
PH
773 else: # don't panic if we can't find it
774 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
775
776 # upload date
777 upload_date = None
ad3bc6ac 778 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
779 if mobj is None:
780 mobj = re.search(
263bd4ec 781 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 782 video_webpage)
c5e8d7af
PH
783 if mobj is not None:
784 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
785 upload_date = unified_strdate(upload_date)
786
ec8deefc
DG
787 m_cat_container = get_element_by_id("eow-category", video_webpage)
788 if m_cat_container:
ad3bc6ac 789 category = self._html_search_regex(
01ed5c9b 790 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
791 default=None)
792 video_categories = None if category is None else [category]
793 else:
794 video_categories = None
ec8deefc 795
c5e8d7af
PH
796 # description
797 video_description = get_element_by_id("eow-description", video_webpage)
798 if video_description:
27dcce19
PH
799 video_description = re.sub(r'''(?x)
800 <a\s+
801 (?:[a-zA-Z-]+="[^"]+"\s+)*?
802 title="([^"]+)"\s+
803 (?:[a-zA-Z-]+="[^"]+"\s+)*?
804 class="yt-uix-redirect-link"\s*>
805 [^<]+
806 </a>
807 ''', r'\1', video_description)
c5e8d7af
PH
808 video_description = clean_html(video_description)
809 else:
810 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
811 if fd_mobj:
812 video_description = unescapeHTML(fd_mobj.group(1))
813 else:
814 video_description = u''
815
f30a38be 816 def _extract_count(count_name):
46374a56 817 count = self._search_regex(
f30a38be
JMF
818 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
819 video_webpage, count_name, default=None)
336c3a69
JMF
820 if count is not None:
821 return int(count.replace(',', ''))
822 return None
f30a38be
JMF
823 like_count = _extract_count(u'like')
824 dislike_count = _extract_count(u'dislike')
336c3a69 825
c5e8d7af 826 # subtitles
d82134c3 827 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 828
c5e8d7af 829 if self._downloader.params.get('listsubtitles', False):
d665f8d3 830 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
831 return
832
833 if 'length_seconds' not in video_info:
834 self._downloader.report_warning(u'unable to extract video duration')
b466b702 835 video_duration = None
c5e8d7af 836 else:
b466b702 837 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 838
1fb07d10
JG
839 # annotations
840 video_annotations = None
841 if self._downloader.params.get('writeannotations', False):
842 video_annotations = self._extract_annotations(video_id)
843
c5e8d7af 844 # Decide which formats to download
c5e8d7af 845 try:
ae7ed920 846 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
847 if not mobj:
848 raise ValueError('Could not find vevo ID')
ae7ed920
PH
849 json_code = uppercase_escape(mobj.group(1))
850 ytplayer_config = json.loads(json_code)
3489b7d2 851 args = ytplayer_config['args']
7ce7e394
JMF
852 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
853 # this signatures are encrypted
44d46655 854 if 'url_encoded_fmt_stream_map' not in args:
f10503db 855 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
856 re_signature = re.compile(r'[&,]s=')
857 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
858 if m_s is not None:
859 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 860 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 861 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 862 if m_s is not None:
00fe14fc
JMF
863 if 'adaptive_fmts' in video_info:
864 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 865 else:
00fe14fc 866 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
867 except ValueError:
868 pass
869
dd27fd17
PH
870 def _map_to_format_list(urlmap):
871 formats = []
872 for itag, video_real_url in urlmap.items():
873 dct = {
874 'format_id': itag,
875 'url': video_real_url,
876 'player_url': player_url,
877 }
0b65e5d4
PH
878 if itag in self._formats:
879 dct.update(self._formats[itag])
dd27fd17
PH
880 formats.append(dct)
881 return formats
882
c5e8d7af
PH
883 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
884 self.report_rtmp_download()
dd27fd17
PH
885 formats = [{
886 'format_id': '_rtmp',
887 'protocol': 'rtmp',
888 'url': video_info['conn'][0],
889 'player_url': player_url,
890 }]
00fe14fc
JMF
891 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
892 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
893 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 894 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 895 url_map = {}
00fe14fc 896 for url_data_str in encoded_url_map.split(','):
c5e8d7af 897 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
898 if 'itag' not in url_data or 'url' not in url_data:
899 continue
900 format_id = url_data['itag'][0]
901 url = url_data['url'][0]
902
903 if 'sig' in url_data:
904 url += '&signature=' + url_data['sig'][0]
905 elif 's' in url_data:
906 encrypted_sig = url_data['s'][0]
907
908 if not age_gate:
909 jsplayer_url_json = self._search_regex(
910 r'"assets":.+?"js":\s*("[^"]+")',
911 video_webpage, u'JS player URL')
912 player_url = json.loads(jsplayer_url_json)
913 if player_url is None:
914 player_url_json = self._search_regex(
915 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
916 video_webpage, u'age gate player URL')
917 player_url = json.loads(player_url_json)
918
919 if self._downloader.params.get('verbose'):
cf010131 920 if player_url is None:
201e9eaa
PH
921 player_version = 'unknown'
922 player_desc = 'unknown'
923 else:
924 if player_url.endswith('swf'):
925 player_version = self._search_regex(
926 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
927 u'flash player', fatal=False)
928 player_desc = 'flash player %s' % player_version
cf010131 929 else:
201e9eaa
PH
930 player_version = self._search_regex(
931 r'html5player-([^/]+?)(?:/html5player)?\.js',
932 player_url,
933 'html5 player', fatal=False)
934 player_desc = u'html5 player %s' % player_version
935
60064c53 936 parts_sizes = self._signature_cache_id(encrypted_sig)
98eb1c3f
PH
937 self.to_screen(u'{%s} signature length %s, %s' %
938 (format_id, parts_sizes, player_desc))
201e9eaa
PH
939
940 signature = self._decrypt_signature(
941 encrypted_sig, video_id, player_url, age_gate)
942 url += '&signature=' + signature
943 if 'ratebypass' not in url:
944 url += '&ratebypass=yes'
945 url_map[format_id] = url
dd27fd17 946 formats = _map_to_format_list(url_map)
1d043b93
JMF
947 elif video_info.get('hlsvp'):
948 manifest_url = video_info['hlsvp'][0]
949 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 950 formats = _map_to_format_list(url_map)
c5e8d7af 951 else:
9abb3204 952 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 953
dd27fd17 954 # Look for the DASH manifest
d68f0cdb 955 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
dd27fd17 956 try:
d68f0cdb 957 # The DASH manifest used needs to be the one from the original video_webpage.
958 # The one found in get_video_info seems to be using different signatures.
959 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
960 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
961 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
962 if age_gate:
3489b7d2 963 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 964 else:
3489b7d2 965 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 966 def decrypt_sig(mobj):
967 s = mobj.group(1)
968 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
969 return '/signature/%s' % dec_s
970 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 971 dash_doc = self._download_xml(
d68f0cdb 972 dash_manifest_url, video_id,
dd27fd17
PH
973 note=u'Downloading DASH manifest',
974 errnote=u'Could not download DASH manifest')
975 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
976 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
977 if url_el is None:
978 continue
979 format_id = r.attrib['id']
980 video_url = url_el.text
981 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
982 f = {
983 'format_id': format_id,
984 'url': video_url,
985 'width': int_or_none(r.attrib.get('width')),
986 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
987 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
988 'filesize': filesize,
989 }
990 try:
991 existing_format = next(
992 fo for fo in formats
993 if fo['format_id'] == format_id)
994 except StopIteration:
995 f.update(self._formats.get(format_id, {}))
996 formats.append(f)
997 else:
998 existing_format.update(f)
999
1000 except (ExtractorError, KeyError) as e:
1001 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
d80044c2 1002
4bcc7bd1 1003 self._sort_formats(formats)
4ea3be0a 1004
1005 return {
1006 'id': video_id,
1007 'uploader': video_uploader,
1008 'uploader_id': video_uploader_id,
1009 'upload_date': upload_date,
1010 'title': video_title,
1011 'thumbnail': video_thumbnail,
1012 'description': video_description,
ec8deefc 1013 'categories': video_categories,
4ea3be0a 1014 'subtitles': video_subtitles,
1015 'duration': video_duration,
1016 'age_limit': 18 if age_gate else 0,
1017 'annotations': video_annotations,
7e8c0af0 1018 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 1019 'view_count': view_count,
1020 'like_count': like_count,
1021 'dislike_count': dislike_count,
1022 'formats': formats,
1023 }
c5e8d7af 1024
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including automatically generated mixes."""
    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        """Log in before extracting."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the 'Youtube' extractor."""
        results = []
        for video_id in ids:
            results.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix ('RD...') playlist."""
        # A mix is generated from a single video: the last 11 characters
        # of the playlist id are used as the id of that video.
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (
            playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, u'Downloading Youtube mix')

        # Try the known title containers in order, stopping at the first
        # non-empty one (the last lookup result is kept when all are empty).
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Extract the playlist at ``url``.

        Returns a single video url_result when the URL also names a video and
        the 'noplaylist' option is set, otherwise a playlist result.
        Raises ExtractorError for invalid URLs, top lists ('TL...') and
        playlists that do not exist or are private.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # The URL may point at a single video inside the playlist
        query_dict = compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes need their own extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(
            self._TEMPLATE_URL % playlist_id, playlist_id)

        # Bail out early when the playlist is missing or private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page):
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Collect the video ids page by page; the first "page" is the
        # playlist webpage itself, later ones come from the load-more widget.
        ids = []
        content_html = more_widget_html = page
        page_num = 1
        while True:
            # Duplicates are dropped, as is the link with index 0
            # (it is not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1135
1136
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for YouTube top lists, addressed as "yttoplist:{channel}:{list title}"."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on how often the list page is downloaded again when it
    # comes back without any video ids.
    _MAX_RETRIES = 10

    def _real_extract(self, url):
        """Return a playlist result with the videos of the requested top list.

        Raises ExtractorError if the list page still contains no videos after
        _MAX_RETRIES retries (previously this retried forever).
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')

        # Find the link to the list on the channel page by its url-encoded title
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # Sometimes the webpage doesn't contain the videos; retry a bounded
        # number of times instead of looping indefinitely.
        for i in range(self._MAX_RETRIES + 1):
            msg = u'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                u'Unable to find any videos in the top list after %d retries'
                % self._MAX_RETRIES)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1167
1168
class YoutubeChannelIE(InfoExtractor):
    """Extractor for all the videos of a YouTube channel."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from ``page``, in order of first appearance."""
        found = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in found:
                continue
            found.append(video_id)
        return found

    def _real_extract(self, url):
        """Return a playlist result with every video of the channel at ``url``.

        Raises ExtractorError when ``url`` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated = bool(re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page))

        if autogenerated:
            # Everything is on this single page; the ajax pages are empty
            # for autogenerated channels and can't be used.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based channel_ajax pages until the load-more
            # widget no longer advertises another page.
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)
                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
c5e8d7af
PH
1223
1224
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user (URL or "ytuser:" keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor defined in this module accepts ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result with the uploads of the user.

        Raises ExtractorError when ``url`` does not match _VALID_URL or when
        an API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url entry per video on the given zero-based page."""
            # The API start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index of this page, inclusive, so that consecutive
            # pages report non-overlapping ranges (1-50, 51-100, ...)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: nothing to yield
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1285
b05654f0
PH
1286
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for "ytsearch" queries, backed by the gdata videos API."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist result with up to ``n`` videos matching ``query``.

        Raises ExtractorError when an API page contains no items.
        """
        PAGE_SIZE = 50
        collected = []
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            # The API start-index is 1-based
            start_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected.extend([item['id'] for item in api_response['items']])

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us over the requested amount
        wanted = collected[:n]
        videos = []
        for video_id in wanted:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1328
c9ae7b95 1329
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API query adds orderby=published."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1335
c9ae7b95
PH
1336
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search result page URLs."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist dict with one url entry per search result on the page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        def build_entry(part_code):
            # The title is optional (fatal=False); the link is mandatory.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', url_snippet),
                'title': item_title,
            }

        # Each result lives in its own title heading
        headings = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        entries = [build_entry(heading) for heading in headings]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1371
1372
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube shows; every season is a separate playlist."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list with one YoutubePlaylist url_result per season link found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist link for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
04cc9617
JMF
1386
1387
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses have to provide the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; the remaining %s takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before extracting."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with the videos of all pages of the feed."""
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            # The markup is delivered under one of two keys
            feed_html = info.get('feed_html') or info.get('content_html')

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The load-more link carries the paging value of the next page
            next_match = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_match is None:
                break
            paging = next_match.group('paging')
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1432
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1438
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1444
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' personal feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1451
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' personal feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' in a non-raw
    # literal is an invalid escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1458
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that hands the favourites playlist over to YoutubePlaylist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url_result for the playlist id found on the my_favorites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The page links to the favourites playlist through a list= parameter
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
15870e90
PH
1469
1470
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs without a video id and reports a quoting hint."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)