]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Do not warn if DASH manifest is missing (#4442)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af 23 compat_str,
4bb4a188
PH
24)
25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the '.youtube.com' PREF cookie (containing hl=en)."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as the docstring promises,
            # instead of falling through with an implicit None.
            return False

        galx = self._search_regex(
            r'(?s)<input.+?name="GALX".+?value="(.+?)"',
            login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; stop
                # here rather than crash on match.group() below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        # The login outcome is not acted upon here; nothing else follows
        # either way.
        if not self._login():
            return
c5e8d7af 185
8377574c 186
de7f3446 187class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
2c62dc26
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 290 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
291
292 # Dash webm audio
55db73ef 293 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 294 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 295
0857baad
PH
296 # Dash webm audio with opus inside
297 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
298 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
299 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
300
ce6b9a2d
PH
301 # RTMP (unnamed)
302 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 303 }
836a086c 304
78caa52a 305 IE_NAME = 'youtube'
2eb88d95
PH
306 _TESTS = [
307 {
4bc3a23e
PH
308 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
309 'info_dict': {
310 'id': 'BaW_jenozKc',
311 'ext': 'mp4',
312 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
313 'uploader': 'Philipp Hagemeister',
314 'uploader_id': 'phihag',
315 'upload_date': '20121002',
316 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
317 'categories': ['Science & Technology'],
3e7c1224
PH
318 'like_count': int,
319 'dislike_count': int,
2eb88d95 320 }
0e853ca4 321 },
0e853ca4 322 {
4bc3a23e
PH
323 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
324 'note': 'Test generic use_cipher_signature video (#897)',
325 'info_dict': {
326 'id': 'UxxajLWwzqY',
327 'ext': 'mp4',
328 'upload_date': '20120506',
329 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
330 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
331 'uploader': 'Icona Pop',
332 'uploader_id': 'IconaPop',
2eb88d95 333 }
c108eb73
JMF
334 },
335 {
4bc3a23e
PH
336 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
337 'note': 'Test VEVO video with age protection (#956)',
338 'info_dict': {
339 'id': '07FYdnEawAQ',
340 'ext': 'mp4',
341 'upload_date': '20130703',
342 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
343 'description': 'md5:64249768eec3bc4276236606ea996373',
344 'uploader': 'justintimberlakeVEVO',
345 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
346 }
347 },
fccd3771 348 {
4bc3a23e
PH
349 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
350 'note': 'Embed-only video (#1746)',
351 'info_dict': {
352 'id': 'yZIXLfi8CZQ',
353 'ext': 'mp4',
354 'upload_date': '20120608',
355 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
356 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
357 'uploader': 'SET India',
358 'uploader_id': 'setindia'
fccd3771
PH
359 }
360 },
dd27fd17 361 {
4bc3a23e
PH
362 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
363 'note': '256k DASH audio (format 141) via DASH manifest',
364 'info_dict': {
365 'id': 'a9LDPn-MO4I',
366 'ext': 'm4a',
367 'upload_date': '20121002',
368 'uploader_id': '8KVIDEO',
369 'description': '',
370 'uploader': '8KVIDEO',
371 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 372 },
4bc3a23e
PH
373 'params': {
374 'youtube_include_dash_manifest': True,
375 'format': '141',
4919603f 376 },
dd27fd17 377 },
3489b7d2
JMF
378 # DASH manifest with encrypted signature
379 {
78caa52a
PH
380 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
381 'info_dict': {
382 'id': 'IB3lcPjvWLA',
383 'ext': 'm4a',
b766eb27
JMF
384 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
385 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
386 'uploader': 'AfrojackVEVO',
387 'uploader_id': 'AfrojackVEVO',
388 'upload_date': '20131011',
3489b7d2 389 },
4bc3a23e 390 'params': {
78caa52a
PH
391 'youtube_include_dash_manifest': True,
392 'format': '141',
3489b7d2
JMF
393 },
394 },
aa79ac0c
PH
395 # Controversy video
396 {
397 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
398 'info_dict': {
399 'id': 'T4XJQO3qol8',
400 'ext': 'mp4',
401 'upload_date': '20100909',
402 'uploader': 'The Amazing Atheist',
403 'uploader_id': 'TheAmazingAtheist',
404 'title': 'Burning Everyone\'s Koran',
405 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
406 }
c522adb1
JMF
407 },
408 # Normal age-gate video (No vevo, embed allowed)
409 {
410 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
411 'info_dict': {
412 'id': 'HtVdAasjOgU',
413 'ext': 'mp4',
414 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
415 'description': 'md5:eca57043abae25130f58f655ad9a7771',
416 'uploader': 'The Witcher',
417 'uploader_id': 'WitcherGame',
418 'upload_date': '20140605',
419 },
420 },
774e208f
PH
421 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
422 {
423 'url': '__2ABJjxzNo',
424 'info_dict': {
425 'id': '__2ABJjxzNo',
426 'ext': 'mp4',
427 'upload_date': '20100430',
428 'uploader_id': 'deadmau5',
429 'description': 'md5:12c56784b8032162bb936a5f76d55360',
430 'uploader': 'deadmau5',
431 'title': 'Deadmau5 - Some Chords (HD)',
432 },
433 'expected_warnings': [
434 'DASH manifest missing',
435 ]
e52a40ab
PH
436 },
437 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
438 {
439 'url': 'lqQg6PlCWgI',
440 'info_dict': {
441 'id': 'lqQg6PlCWgI',
442 'ext': 'mp4',
443
444 }
774e208f 445 }
2eb88d95
PH
446 ]
447
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the parent extractor and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Keyed by (player_url, signature cache id) in _decrypt_signature
    self._player_cache = dict()
e0df6211 451
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 455
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
459
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
463
def report_rtmp_download(self):
    """Print a notice that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 467
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of a signature, joined by dots."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
471
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that maps an encrypted signature to a deciphered one.

    The player id and type ('js' or 'swf') are taken from player_url.
    If the downloader cache already holds an index list for this player
    and signature shape, a function that applies it is returned without
    downloading anything. Otherwise the player is downloaded and parsed,
    and the index list it produces is stored in the cache.

    Raises ExtractorError if the player URL cannot be identified or the
    player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from the downloader's cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Sanity check: the cache key must not contain any path component
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an assert would be stripped under -O and leave
        # `res` unbound further down.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose i-th character has code point i,
    # so the output reveals which input index ends up at each position.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
513
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function `func`.

    `func` is run on a probe string whose i-th character has code point i;
    the resulting index sequence is rendered as a sum of `s[...]` items and
    slices, guarded by a check on the lengths of the signature's parts.
    """
    def _slice_expr(first, last, stride):
        # Render one run of consecutive indices as a slice of `s`
        head = '' if first == 0 else str(first)
        stop = last + stride
        tail = (':%d' % stop) if stop >= 0 else ':'
        stride_part = '' if stride == 1 else (':%d' % stride)
        return 's[%s%s%s]' % (head, tail, stride_part)

    def gen_sig_code(idxs):
        stride = None
        # Placeholder only; `first` is assigned whenever `stride` is set
        first = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if stride is not None:
                if delta == stride:
                    # Still inside the current run
                    continue
                yield _slice_expr(first, prev, stride)
                stride = None
                continue
            if delta in [-1, 1]:
                # A new ascending or descending run starts at `prev`
                stride = delta
                first = prev
                continue
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending after the last pair
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(first, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(gen_sig_code(index_spec))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 552
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a one-argument callable wrapping the signature function found in `jscode`.

    The function name is the identifier that follows `.sig||` in the code.
    """
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    sig_func = interpreter.extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return sig_func([s])
    return decipher
561
def _parse_sig_swf(self, file_contents):
    """Return a one-argument callable wrapping `SignatureDecipher.decipher` from the SWF."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    sig_func = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return sig_func([s])
    return decipher
568
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    Signature functions are kept in self._player_cache, keyed by the player
    URL and the signature's cache id. Any failure is re-raised as an
    ExtractorError carrying the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are fetched over https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        try:
            decipher = self._player_cache[cache_key]
        except KeyError:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        tb = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + tb, cause=e)
e0df6211 592
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL for the video.

    The track list is downloaded from video.google.com; only the first
    track listed for each language is kept. An empty dict is returned,
    after a warning, if the list cannot be downloaded or has no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    for track_name, lang in tracks:
        if lang in sub_lang_list:
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if not sub_lang_list:
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
    return sub_lang_list
620
055e6f36 621 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
622 """We need the webpage for getting the captions url, pass it as an
623 argument to speed up the process."""
ca715127 624 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
69ea8ca4 625 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 626 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 627 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
628 if mobj is None:
629 self._downloader.report_warning(err_msg)
630 return {}
631 player_config = json.loads(mobj.group(1))
632 try:
0792d563
PH
633 args = player_config['args']
634 caption_url = args['ttsurl']
635 timestamp = args['timestamp']
055e6f36
JMF
636 # We get the available subtitles
637 list_params = compat_urllib_parse.urlencode({
638 'type': 'list',
639 'tlangs': 1,
640 'asrs': 1,
de7f3446 641 })
055e6f36 642 list_url = caption_url + '&' + list_params
e26f8712 643 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 644 original_lang_node = caption_list.find('track')
5f6a1245 645 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
69ea8ca4 646 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
647 return {}
648 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
649
650 sub_lang_list = {}
651 for lang_node in caption_list.findall('target'):
652 sub_lang = lang_node.attrib['lang_code']
653 params = compat_urllib_parse.urlencode({
654 'lang': original_lang,
655 'tlang': sub_lang,
656 'fmt': sub_format,
657 'ts': timestamp,
658 'kind': 'asr',
659 })
660 sub_lang_list[sub_lang] = caption_url + '&' + params
661 return sub_lang_list
de7f3446
JMF
662 # An extractor error can be raise by the download process if there are
663 # no automatic captions but there are subtitles
664 except (KeyError, ExtractorError):
665 self._downloader.report_warning(err_msg)
666 return {}
667
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return group 2 of the _VALID_URL match (the video ID) for `url`.

    Raises ExtractorError if `url` does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
675
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download a formats manifest and return a dict mapping itag to URL.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is the number following 'itag/' in the URL.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#'-prefixed lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
690
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the in-video annotations page for `video_id`."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 694
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download the DASH manifest and return a list of format dicts.

    Any '/s/<sig>' component of the manifest URL is replaced by
    '/signature/<deciphered sig>' before the download. Representations
    without a BaseURL element are skipped. The first time a format id is
    seen its dict is completed from self._formats; later occurrences
    update the dict that is already in the list.
    """
    def decrypt_sig(mobj):
        deciphered = self._decrypt_signature(
            mobj.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % deciphered

    dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    for rep in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
        url_el = rep.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
        if url_el is None:
            continue
        format_id = rep.attrib['id']
        attrs = rep.attrib
        info = {
            'format_id': format_id,
            'url': url_el.text,
            'width': int_or_none(attrs.get('width')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': int_or_none(
                url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')),
            'fps': int_or_none(attrs.get('frameRate')),
        }
        # Look for an entry with the same id added earlier in this loop
        known = next(
            (fo for fo in formats if fo['format_id'] == format_id), None)
        if known is None:
            info.update(self._formats.get(format_id, {}))
            formats.append(info)
        else:
            known.update(info)
    return formats
734
def _real_extract(self, url):
    """Extract the metadata and the available formats of a single video.

    Downloads the watch page, obtains the video info (from the embedded
    player configuration or from get_video_info), and builds the format
    list from RTMP, the encoded stream maps or an HLS manifest, optionally
    extended with the formats of the DASH manifest.

    Returns the info dict, or None when the 'listsubtitles' parameter is
    set (the available subtitles are listed instead).

    Raises ExtractorError when the video info carries no token, the video
    is a rental, the uploader name is missing or no format source exists.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape characters in the matched URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except (ValueError, KeyError):
            # KeyError: the player config has no 'args' entry; treat it
            # like any other unusable config instead of aborting.
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    # Accept both http and https links; the dots of the host are literal
    mobj = re.search(r'<link itemprop="url" href="https?://www\.youtube\.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace runs
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories (only the first link of the category list is used)
    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Read a counter such as 'like' from the element with the id
        # 'watch-<count_name>'; returns None when it cannot be found
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Build format dicts from an itag -> URL map, completing them
        # with the static data of self._formats where the itag is known
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                # The signature is given in the clear
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # The signature is encrypted and needs the player code
                encrypted_sig = url_data['s'][0]

                if not age_gate:
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                # The DASH formats are optional: warn and carry on
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                formats.extend(dash_formats)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1045
5f6a1245 1046
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the Youtube IE."""
        results = []
        for vid_id in ids:
            results.append(
                self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist from the watch page of its seed video."""
        # A mix is generated from a single video: the id of the
        # playlist is just 'RD' + video_id
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # Take the first candidate class that yields a non-empty element
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        id_matches = re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage)
        entries = self._ids_to_results(orderedSet(id_matches))

        return self.playlist_result(entries, playlist_id, title)

    def _real_extract(self, url):
        """Extract the playlist (or the single video) referenced by *url*."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query = compat_urlparse.urlparse(url).query
        query_dict = compat_urlparse.parse_qs(query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(
            self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page
        # serves both as the content and as the "load more" widget
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1217
1218
class YoutubeTopListIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # How many times the list page is downloaded again when it comes back
    # without videos; bounded so that a page that never contains them
    # cannot keep the extractor looping forever
    _MAX_LIST_RETRIES = 5

    def _real_extract(self, url):
        """Extract the videos of the top list named by a yttoplist: URL.

        The list is looked up by its title on the channel page and then
        downloaded, retrying a limited number of times while the page
        contains no videos.

        Raises ExtractorError if no video is found after all the retries.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry (a bounded number of times) until we get them
        for attempt in range(self._MAX_LIST_RETRIES + 1):
            msg = 'Downloading top list'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any videos in the top list "%s"' % title)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1260
1261
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked in *page*, in page order."""
        ids_in_page = []
        already_seen = set()
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in already_seen:
                continue
            already_seen.add(video_id)
            ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Extract all the videos of the channel referenced by *url*."""
        channel_id = self._match_id(url)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id in self.extract_videos_from_page(channel_page):
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Walk the ajax pages lazily until one of them no longer
            # carries the "load more" indicator
            pagenum = 0
            while True:
                pagenum += 1
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                for video_id in self.extract_videos_from_page(page['content_html']):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    return

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1318
1319
class YoutubeUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Tell whether this extractor (and no other one here) handles *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Extract the uploads of the user referenced by *url*."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            page_size = self._GDATA_PAGE_SIZE
            start_index = pagenum * page_size + 1

            page = self._download_webpage(
                self._GDATA_URL % (username, page_size, start_index),
                username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + page_size))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            feed = response['feed']
            if 'entry' not in feed:
                # No entries: this page yields nothing
                return

            # Extract video identifiers
            for entry in feed['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }

        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1388
b05654f0
PH
1389
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        video_ids = []
        pagenum = 0
        limit = n
        while PAGE_SIZE * pagenum < limit:
            start_index = PAGE_SIZE * pagenum + 1
            data_json = self._download_webpage(
                self._API_URL % (quoted_query, start_index),
                video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have brought more ids than were requested
        videos = []
        for video_id in video_ids[:n]:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1431
c9ae7b95 1432
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as YoutubeSearchIE; the API URL additionally asks for the
    # results to be ordered by publication date
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1438
c9ae7b95
PH
1439
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return the items of a search results page as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry(part_code):
            # One result: the title is optional, the link is mandatory
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        entries = [
            _entry(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1481
1482
class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists of a show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = [
            m.group(1)
            for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1517
1518
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; ``%s`` is filled with the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as one playlist.

        Paging stops when the "load more" markup no longer carries a
        ``paging=<number>`` link.
        """
        collected = []
        paging = 0
        page_number = 0
        while True:
            page_number += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_number)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a separate widget, look for the next-page link in the
            # feed markup itself.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            # De-duplicate ids within the page while keeping their order.
            page_ids = orderedSet(
                match.group(1)
                for match in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in page_ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1564
5f6a1245 1565
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed (URL or ":ytrec" shortcut)."""
    # Feed selection and resulting playlist title.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    # What the user types and what is shown in the extractor list.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
c626a3d9 1571
5f6a1245 1572
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed (URL or ":ytwatchlater")."""
    # Feed selection; this feed is flagged as a personal one.
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'

    # What the user types and what is shown in the extractor list.
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
c626a3d9 1579
5f6a1245 1580
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed (URL or ":ythistory" shortcut)."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like every other _VALID_URL in this file: ``\.`` is not a
    # valid Python string escape, so a plain literal triggers an
    # invalid-escape warning on recent Python versions. The pattern text is
    # unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # This feed is flagged as a personal one.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1587
5f6a1245 1588
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites to the underlying playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a result that hands the favourites playlist id to YoutubePlaylist.

        The favourites page is always fetched from its fixed address,
        whatever form of *url* matched.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is read from the first ``list=`` parameter found.
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1599
1600
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the videos of the subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    # The shortcut accepted by _VALID_URL is ":ytsubs" (with the leading
    # colon), so the description advertises exactly that form, like the other
    # feed extractors do.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict with every video found in the feed.

        The first page is the regular subscriptions web page; further pages
        are fetched as JSON for as long as the current "load more" markup
        advertises a ``data-uix-load-more-href`` link.
        """
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            # De-duplicate within the page while keeping the order.
            new_ids = orderedSet(matches)
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1637
1638
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that carry no video id and explain why.

    Such URLs typically result from an unquoted ``&`` on the command line, so
    extraction always fails with a hint about quoting.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .'
        )
        raise ExtractorError(quoting_hint, expected=True)