]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[tvp] Update tests and improve output
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af 23 compat_str,
4bb4a188
PH
24)
25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account endpoints used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie (which carries hl=en) for .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        An ExtractorError is also raised when the login form rejects the
        password field (an application-specific password was probably used).
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # instead of crashing on match.group() below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same reasoning as for secTok above
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in (no-op without a downloader)."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here; _login() has already warned
        if not self._login():
            return
c5e8d7af 185
8377574c 186
de7f3446 187class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 259 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
230b2287
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 290 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 291 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 292 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
293
294 # Dash webm audio
55db73ef 295 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 296 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 297
0857baad
PH
298 # Dash webm audio with opus inside
299 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
300 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
301 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
302
ce6b9a2d
PH
303 # RTMP (unnamed)
304 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 305 }
836a086c 306
78caa52a 307 IE_NAME = 'youtube'
2eb88d95
PH
308 _TESTS = [
309 {
4bc3a23e
PH
310 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
311 'info_dict': {
312 'id': 'BaW_jenozKc',
313 'ext': 'mp4',
314 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
315 'uploader': 'Philipp Hagemeister',
316 'uploader_id': 'phihag',
317 'upload_date': '20121002',
318 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
319 'categories': ['Science & Technology'],
3e7c1224
PH
320 'like_count': int,
321 'dislike_count': int,
2eb88d95 322 }
0e853ca4 323 },
0e853ca4 324 {
4bc3a23e
PH
325 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
326 'note': 'Test generic use_cipher_signature video (#897)',
327 'info_dict': {
328 'id': 'UxxajLWwzqY',
329 'ext': 'mp4',
330 'upload_date': '20120506',
331 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
332 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
333 'uploader': 'Icona Pop',
334 'uploader_id': 'IconaPop',
2eb88d95 335 }
c108eb73
JMF
336 },
337 {
4bc3a23e
PH
338 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
339 'note': 'Test VEVO video with age protection (#956)',
340 'info_dict': {
341 'id': '07FYdnEawAQ',
342 'ext': 'mp4',
343 'upload_date': '20130703',
344 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
345 'description': 'md5:64249768eec3bc4276236606ea996373',
346 'uploader': 'justintimberlakeVEVO',
347 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
348 }
349 },
fccd3771 350 {
4bc3a23e
PH
351 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
352 'note': 'Embed-only video (#1746)',
353 'info_dict': {
354 'id': 'yZIXLfi8CZQ',
355 'ext': 'mp4',
356 'upload_date': '20120608',
357 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
358 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
359 'uploader': 'SET India',
360 'uploader_id': 'setindia'
fccd3771
PH
361 }
362 },
dd27fd17 363 {
4bc3a23e
PH
364 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
365 'note': '256k DASH audio (format 141) via DASH manifest',
366 'info_dict': {
367 'id': 'a9LDPn-MO4I',
368 'ext': 'm4a',
369 'upload_date': '20121002',
370 'uploader_id': '8KVIDEO',
371 'description': '',
372 'uploader': '8KVIDEO',
373 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 374 },
4bc3a23e
PH
375 'params': {
376 'youtube_include_dash_manifest': True,
377 'format': '141',
4919603f 378 },
dd27fd17 379 },
3489b7d2
JMF
380 # DASH manifest with encrypted signature
381 {
78caa52a
PH
382 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
383 'info_dict': {
384 'id': 'IB3lcPjvWLA',
385 'ext': 'm4a',
b766eb27
JMF
386 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
387 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
388 'uploader': 'AfrojackVEVO',
389 'uploader_id': 'AfrojackVEVO',
390 'upload_date': '20131011',
3489b7d2 391 },
4bc3a23e 392 'params': {
78caa52a
PH
393 'youtube_include_dash_manifest': True,
394 'format': '141',
3489b7d2
JMF
395 },
396 },
aa79ac0c
PH
397 # Controversy video
398 {
399 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
400 'info_dict': {
401 'id': 'T4XJQO3qol8',
402 'ext': 'mp4',
403 'upload_date': '20100909',
404 'uploader': 'The Amazing Atheist',
405 'uploader_id': 'TheAmazingAtheist',
406 'title': 'Burning Everyone\'s Koran',
407 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
408 }
c522adb1
JMF
409 },
410 # Normal age-gate video (No vevo, embed allowed)
411 {
412 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
413 'info_dict': {
414 'id': 'HtVdAasjOgU',
415 'ext': 'mp4',
416 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 417 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
418 'uploader': 'The Witcher',
419 'uploader_id': 'WitcherGame',
420 'upload_date': '20140605',
421 },
422 },
fccae2b9
S
423 # Age-gate video with encrypted signature
424 {
425 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
426 'info_dict': {
427 'id': '6kLq3WMV1nU',
428 'ext': 'mp4',
429 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
430 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
431 'uploader': 'LloydVEVO',
432 'uploader_id': 'LloydVEVO',
433 'upload_date': '20110629',
434 },
435 },
774e208f
PH
436 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
437 {
438 'url': '__2ABJjxzNo',
439 'info_dict': {
440 'id': '__2ABJjxzNo',
441 'ext': 'mp4',
442 'upload_date': '20100430',
443 'uploader_id': 'deadmau5',
444 'description': 'md5:12c56784b8032162bb936a5f76d55360',
445 'uploader': 'deadmau5',
446 'title': 'Deadmau5 - Some Chords (HD)',
447 },
448 'expected_warnings': [
449 'DASH manifest missing',
450 ]
e52a40ab
PH
451 },
452 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
453 {
454 'url': 'lqQg6PlCWgI',
455 'info_dict': {
456 'id': 'lqQg6PlCWgI',
457 'ext': 'mp4',
cbe2bd91
PH
458 'upload_date': '20120731',
459 'uploader_id': 'olympic',
460 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
461 'uploader': 'Olympics',
462 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
463 },
464 'params': {
465 'skip_download': 'requires avconv',
e52a40ab 466 }
cbe2bd91 467 },
2eb88d95
PH
468 ]
469
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractor, then start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature; keyed by (player_url, signature cache id)
    self._player_cache = dict()
e0df6211 473
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage for video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 477
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a note that video information for video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
481
def report_unavailable_format(self, video_id, format):
    """Print a note that the requested format is not available for video_id."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
485
def report_rtmp_download(self):
    """Print a note that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 489
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of example_sig, joined by dots."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
493
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the player at player_url.

    The player id and type (file extension) are parsed from player_url.
    If the downloader cache holds an index list for this player and
    signature shape, a function applying that permutation is returned
    without downloading anything. Otherwise the player is downloaded,
    parsed (JS or SWF), and the derived index list is stored in the cache.

    Raises ExtractorError if the player URL cannot be parsed, the cache id
    is not a plain file name, or the player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Explicit check rather than assert: asserts are stripped under -O and
    # func_id is used as a cache entry name.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly; an assert would vanish under -O and leave
        # `res` unbound below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose i-th character has code point i,
    # so the output's code points are the source indices to cache.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
535
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is applied to a probe string whose i-th character has code
    point i; the resulting index sequence is rendered as a sum of
    single-index and slice expressions on `s`.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' expressions covering idxs, merging +/-1 runs into slices."""
        def _slice_expr(first, last, stride):
            stop = last + stride
            lower = str(first) if first != 0 else ''
            # A negative stop would be read as "from the end", so leave it open
            upper = (':%d' % stop) if stop >= 0 else ':'
            tail = (':%d' % stride) if stride != 1 else ''
            return 's[%s%s%s]' % (lower, upper, tail)

        stride = None
        # Quelch pyflakes warnings - run_start will be set when stride is set
        run_start = '(Never used)'
        for prev, cur in zip(idxs[:-1], idxs[1:]):
            delta = cur - prev
            if stride is None:
                if delta in [-1, 1]:
                    # A new run begins at prev
                    stride = delta
                    run_start = prev
                else:
                    yield 's[%d]' % prev
            elif delta != stride:
                # The current run ended at prev
                yield _slice_expr(run_start, prev, stride)
                stride = None
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permuted = func(probe)
    index_spec = [ord(ch) for ch in permuted]
    expr_code = ' + '.join(gen_sig_code(index_spec))
    part_lengths = [compat_str(len(p)) for p in example_sig.split('.')]
    signature_id_tuple = '(%s)' % ', '.join(part_lengths)
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 574
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in jscode.

    The function name is taken from the `.sig||NAME(` pattern and the
    function itself is extracted with JSInterpreter.
    """
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    initial_function = interpreter.extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])
    return decipher
583
def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping SignatureDecipher.decipher from the SWF data."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    searched_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(searched_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])
    return decipher
590
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    The signature function is looked up in (or added to) the per-instance
    player cache, keyed by player URL and signature shape. Any failure is
    re-raised as ExtractorError carrying the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are fetched over https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s
            )
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        tb = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + tb, cause=e)
e0df6211 614
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL for video_id.

    The track list is fetched from the timedtext service; only the first
    track seen per language is kept. An empty dict is returned (with a
    warning) when the list cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        attrs = track.attrib
        lang = attrs['lang_code']
        if lang not in sub_lang_list:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': attrs['name'].encode('utf-8'),
            })
            sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query
    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
641
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    Every failure (player config missing or unparsable, required keys
    absent, caption list not downloadable, no caption track) produces a
    warning and an empty dict rather than an exception.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The non-greedy regex stops at the first '};', which may truncate
        # the JSON; treat that like any other "no captions" condition.
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']
        caption_kind = original_lang_node.attrib.get('kind', '')

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': caption_kind,
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
689
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in url.

    Raises ExtractorError if url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
697
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download the manifest at manifest_url and return a dict itag -> format URL.

    Every non-empty manifest line that does not start with '#' is taken
    as a format URL; its itag is read from the 'itag/<digits>/' part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#' directive/comment lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
712
1fb07d10
JG
713 def _extract_annotations(self, video_id):
714 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 715 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 716
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return the formats it describes.

    Any encrypted ``/s/<sig>`` component of the manifest URL is replaced
    by its decrypted ``/signature/<sig>`` form before the download.
    Each ``Representation`` that has a ``BaseURL`` becomes one format
    dict; the first time a format id is seen the dict is additionally
    updated with the entry for that id in ``self._formats``, later
    occurrences of the same id update the already collected dict.
    A ``Representation`` without an ``id`` attribute raises KeyError.
    """
    def _decrypt_path_signature(match):
        decrypted = self._decrypt_signature(
            match.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % decrypted

    dash_manifest_url = re.sub(
        r'/s/([\w\.]+)', _decrypt_path_signature, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    # format_id -> dict already stored in ``formats``
    seen_formats = {}
    representations = dash_doc.findall(
        './/{urn:mpeg:DASH:schema:MPD:2011}Representation')
    for representation in representations:
        base_url_el = representation.find(
            '{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
        if base_url_el is None:
            continue
        attrs = representation.attrib
        format_id = attrs['id']
        fmt = {
            'format_id': format_id,
            'url': base_url_el.text,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': int_or_none(base_url_el.attrib.get(
                '{http://youtube.com/yt/2012/10/10}contentLength')),
            'fps': int_or_none(attrs.get('frameRate')),
        }
        previous = seen_formats.get(format_id)
        if previous is None:
            fmt.update(self._formats.get(format_id, {}).items())
            formats.append(fmt)
            seen_formats[format_id] = fmt
        else:
            previous.update(fmt)
    return formats
757
def _real_extract(self, url):
    """Extract the metadata and available formats of a single video.

    The video info comes from one of three places: the age-gate flow
    (embed page + get_video_info), the ``ytplayer.config`` object in the
    watch page, or the plain get_video_info fallback.  Metadata is then
    scraped from the watch page and the format list is built from the
    RTMP connection, the (adaptive) stream maps or the HLS manifest,
    optionally extended with the formats of the DASH manifest.

    Returns the info dict, or None when only the list of subtitles was
    requested.  Raises ExtractorError when no usable video info, uploader
    or format information is available.
    """
    proto = 'https'
    if self._downloader.params.get('prefer_insecure', False):
        proto = 'http'

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    next_url_match = re.search(self._NEXT_URL_RE, url)
    if next_url_match:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(next_url_match.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    watch_url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(watch_url, video_id)

    # Attempt to extract SWF player URL
    player_url = None
    swf_match = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if swf_match is not None:
        player_url = re.sub(r'\\(.)', r'\1', swf_match.group(1))

    # Get video info
    age_gate = re.search(r'player-age-gate-content">', video_webpage) is not None
    if age_gate:
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(embed_url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        try:
            # Try looking directly into the video webpage
            config_match = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not config_match:
                raise ValueError('Could not find ytplayer.config')  # caught below
            ytplayer_config = json.loads(uppercase_escape(config_match.group(1)))
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break

    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        raise ExtractorError(
            '"token" parameter not in video info for unknown reason',
            video_id=video_id)

    view_count = None
    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    uploader_match = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if uploader_match is None:
        video_uploader_id = None
        self._downloader.report_warning('unable to extract uploader nickname')
    else:
        video_uploader_id = uploader_match.group(1)

    # title
    if 'title' not in video_info:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'
    else:
        video_title = video_info['title'][0]

    # thumbnail image
    # We try first to get a high quality image:
    thumb_match = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
    if thumb_match is not None:
        video_thumbnail = thumb_match.group(1)
    elif 'thumbnail_url' in video_info:
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
    else:  # don't panic if we can't find it
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None

    # upload date
    upload_date = None
    # Match objects are always truthy, so ``or`` only runs the second
    # search when the first one found nothing.
    date_match = (
        re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage) or
        re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage))
    if date_match is not None:
        date_text = ' '.join(re.sub(r'[/,-]', r' ', date_match.group(1)).split())
        upload_date = unified_strdate(date_text)

    # categories
    video_categories = None
    category_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if category_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', category_container, 'category',
            default=None)
        if category is not None:
            video_categories = [category]

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened redirect links by the full URL kept in their
        # title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        meta_match = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(meta_match.group(1)) if meta_match else ''

    def _extract_count(count_name):
        raw_count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        return None if raw_count is None else int(raw_count.replace(',', ''))
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    # duration
    if 'length_seconds' in video_info:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
    else:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # NOTE: reads ``player_url`` when called, i.e. after the
        # signature handling below may have rebound it.
        format_list = []
        for itag, video_real_url in urlmap.items():
            entry = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                entry.update(self._formats[itag])
            format_list.append(entry)
        return format_list

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            format_url = url_data['url'][0]

            if 'sig' in url_data:
                format_url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                jsplayer_url_json = self._search_regex(
                    r'"assets":.+?"js":\s*("[^"]+")',
                    embed_webpage if age_gate else video_webpage, 'JS player URL')
                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_desc = 'unknown'
                    elif player_url.endswith('swf'):
                        player_version = self._search_regex(
                            r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                            'flash player', fatal=False)
                        player_desc = 'flash player %s' % player_version
                    else:
                        player_version = self._search_regex(
                            r'html5player-([^/]+?)(?:/html5player)?\.js',
                            player_url,
                            'html5 player', fatal=False)
                        player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                format_url += '&signature=' + signature
            if 'ratebypass' not in format_url:
                format_url += '&ratebypass=yes'
            url_map[format_id] = format_url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        formats = _map_to_format_list(
            self._extract_from_m3u8(manifest_url, video_id))
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                # Hide the formats we found through non-DASH
                dash_keys = set(df['format_id'] for df in dash_formats)
                for fmt in formats:
                    if fmt['format_id'] not in dash_keys:
                        continue
                    fmt['format_id'] = 'nondash-%s' % fmt['format_id']
                    fmt['preference'] = fmt.get('preference', 0) - 10000
                formats.extend(dash_formats)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1075
5f6a1245 1076
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including the generated "mixes".

    Accepts playlist URLs (``playlist``, ``watch`` with a ``list``
    parameter, ``embed/videoseries``, ``p/<id>``, ...) as well as bare
    playlist ids carrying one of the known prefixes.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
        },
        # Was misspelled 'playlist_mincout', so the check never ran
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before extracting."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url result handled by the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix, whose videos are listed in the watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title containers in turn
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for *url*.

        When the URL also names a video (``v`` parameter) and the
        ``noplaylist`` option is set, only that video is returned.
        Raises ExtractorError for an invalid URL and for playlists that
        do not exist or are private.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # The first page holds both the videos and the "load more" widget
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1254
1255
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first occurrence only."""
        return orderedSet(
            re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page))

    def _real_extract(self, url):
        """Return a playlist with every video of the channel.

        Autogenerated channels are read from the single channel page;
        other channels are read lazily, following the "load more" links.
        """
        channel_id = self._match_id(url)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_match is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # The first page holds both the videos and the widget
            content_html = channel_page
            more_widget_html = channel_page
            for pagenum in itertools.count(1):
                for video_id in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                more_match = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not more_match:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % more_match.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1317
1318
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a YouTube user (GData uploads feed)."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Tell whether this extractor, and no other one here, handles *url*."""
        # Don't return True if the url can be extracted with another
        # youtube extractor: the regex is too permissive and would match
        # their URLs as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads.

        Raises ExtractorError when an API response is not valid JSON.
        """
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The start-index parameter of the API is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index requested by this page (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1387
b05654f0
PH
1388
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData video search feed."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        video_ids = []
        limit = n
        pagenum = 0
        while PAGE_SIZE * pagenum < limit:
            # The start-index parameter of the API is 1-based
            result_url = self._API_URL % (
                quoted_query, PAGE_SIZE * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend([item['id'] for item in api_response['items']])

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past the requested amount
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee 1430
c9ae7b95 1431
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor that orders results by publish date."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class plus ``orderby=published``
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1437
c9ae7b95
PH
1438
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for the result list of a YouTube search results URL."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result on the page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from(part_code):
            # The title is optional, the link is mandatory
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            item_path = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_path),
                'title': item_title,
            }

        entries = [
            _entry_from(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1480
1481
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; every season is delegated as a playlist."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))
        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1516
1517
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; has one remaining %s slot for the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all found videos as a playlist."""
        collected = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)

            # The HTML is read from 'feed_html', falling back to 'content_html'.
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget the feed HTML itself is searched
            # for the "load more" link.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                # No "load more" link: this was the last page
                break
            paging = next_page.group('paging')
            page_num += 1

        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1565
5f6a1245 1566
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed.

    Matches the feed URL as well as the ":ytrec" / ":ytrecommended"
    keywords.
    """
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1572
5f6a1245 1573
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' personal feed.

    Matches the feed URL as well as the ":ytwatchlater" keyword.
    """
    _FEED_NAME = 'watch_later'
    # Loaded through action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'

    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1580
5f6a1245 1581
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' personal feed.

    Matches the feed URL as well as the ":ythistory" keyword.
    """
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the pattern contains a backslash-escaped dot, which is an
    # invalid escape sequence in a plain string literal. The resulting
    # pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Loaded through action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1588
5f6a1245 1589
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and delegate to YoutubePlaylist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result pointing at the playlist id found on the favourites page."""
        # The fixed favourites URL is always fetched; ``url`` may just be
        # the ":ytfav" keyword.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1600
1601
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the videos of the subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect video ids from the feed page and every "load more" page."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        video_ids = []
        # On the first pass both the ids and the "load more" link are
        # searched in the full page HTML.
        content_html = first_page
        more_widget_html = first_page

        page_num = 1
        while True:
            video_ids += orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not more_link:
                break

            next_chunk = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = next_chunk['content_html']
            more_widget_html = next_chunk['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1638
1639
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lost their parameters and explain why.

    Such URLs typically result from an unquoted "&" on the command line;
    extraction always fails with an explanatory, expected error.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with quoting advice."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .'
        )
        raise ExtractorError(quoting_hint, expected=True)
772fd5cc
PH
1667
1668
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is only 1 to 10 characters long.

    Extraction always fails with an expected error that reports the
    incomplete id and the URL it came from.
    """
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # The "www." prefix is optional (as in YoutubeTruncatedURLIE), so that
    # truncated URLs on the bare youtube.com host are reported as well.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)