]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[motorsport] Fix extraction and make trailing '/' optional
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af 23 compat_str,
4bb4a188
PH
24)
25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are requested with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        An error is also raised when the login response flags the password field
        (id="errormsg_0_Passwd").
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly (the documented contract is
            # True/False) instead of falling through with None
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # instead of crashing on match.group() below
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same as above: timeStmp is mandatory for the TFA form
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 185
8377574c 186
de7f3446 187class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 259 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
2c62dc26
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 290 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
291
292 # Dash webm audio
55db73ef 293 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 294 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 295
0857baad
PH
296 # Dash webm audio with opus inside
297 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
298 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
299 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
300
ce6b9a2d
PH
301 # RTMP (unnamed)
302 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 303 }
836a086c 304
78caa52a 305 IE_NAME = 'youtube'
2eb88d95
PH
306 _TESTS = [
307 {
4bc3a23e
PH
308 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
309 'info_dict': {
310 'id': 'BaW_jenozKc',
311 'ext': 'mp4',
312 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
313 'uploader': 'Philipp Hagemeister',
314 'uploader_id': 'phihag',
315 'upload_date': '20121002',
316 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
317 'categories': ['Science & Technology'],
3e7c1224
PH
318 'like_count': int,
319 'dislike_count': int,
2eb88d95 320 }
0e853ca4 321 },
0e853ca4 322 {
4bc3a23e
PH
323 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
324 'note': 'Test generic use_cipher_signature video (#897)',
325 'info_dict': {
326 'id': 'UxxajLWwzqY',
327 'ext': 'mp4',
328 'upload_date': '20120506',
329 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
330 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
331 'uploader': 'Icona Pop',
332 'uploader_id': 'IconaPop',
2eb88d95 333 }
c108eb73
JMF
334 },
335 {
4bc3a23e
PH
336 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
337 'note': 'Test VEVO video with age protection (#956)',
338 'info_dict': {
339 'id': '07FYdnEawAQ',
340 'ext': 'mp4',
341 'upload_date': '20130703',
342 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
343 'description': 'md5:64249768eec3bc4276236606ea996373',
344 'uploader': 'justintimberlakeVEVO',
345 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
346 }
347 },
fccd3771 348 {
4bc3a23e
PH
349 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
350 'note': 'Embed-only video (#1746)',
351 'info_dict': {
352 'id': 'yZIXLfi8CZQ',
353 'ext': 'mp4',
354 'upload_date': '20120608',
355 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
356 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
357 'uploader': 'SET India',
358 'uploader_id': 'setindia'
fccd3771
PH
359 }
360 },
dd27fd17 361 {
4bc3a23e
PH
362 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
363 'note': '256k DASH audio (format 141) via DASH manifest',
364 'info_dict': {
365 'id': 'a9LDPn-MO4I',
366 'ext': 'm4a',
367 'upload_date': '20121002',
368 'uploader_id': '8KVIDEO',
369 'description': '',
370 'uploader': '8KVIDEO',
371 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 372 },
4bc3a23e
PH
373 'params': {
374 'youtube_include_dash_manifest': True,
375 'format': '141',
4919603f 376 },
dd27fd17 377 },
3489b7d2
JMF
378 # DASH manifest with encrypted signature
379 {
78caa52a
PH
380 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
381 'info_dict': {
382 'id': 'IB3lcPjvWLA',
383 'ext': 'm4a',
b766eb27
JMF
384 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
385 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
386 'uploader': 'AfrojackVEVO',
387 'uploader_id': 'AfrojackVEVO',
388 'upload_date': '20131011',
3489b7d2 389 },
4bc3a23e 390 'params': {
78caa52a
PH
391 'youtube_include_dash_manifest': True,
392 'format': '141',
3489b7d2
JMF
393 },
394 },
aa79ac0c
PH
395 # Controversy video
396 {
397 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
398 'info_dict': {
399 'id': 'T4XJQO3qol8',
400 'ext': 'mp4',
401 'upload_date': '20100909',
402 'uploader': 'The Amazing Atheist',
403 'uploader_id': 'TheAmazingAtheist',
404 'title': 'Burning Everyone\'s Koran',
405 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
406 }
c522adb1
JMF
407 },
408 # Normal age-gate video (No vevo, embed allowed)
409 {
410 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
411 'info_dict': {
412 'id': 'HtVdAasjOgU',
413 'ext': 'mp4',
414 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
415 'description': 'md5:eca57043abae25130f58f655ad9a7771',
416 'uploader': 'The Witcher',
417 'uploader_id': 'WitcherGame',
418 'upload_date': '20140605',
419 },
420 },
fccae2b9
S
421 # Age-gate video with encrypted signature
422 {
423 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
424 'info_dict': {
425 'id': '6kLq3WMV1nU',
426 'ext': 'mp4',
427 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
428 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
429 'uploader': 'LloydVEVO',
430 'uploader_id': 'LloydVEVO',
431 'upload_date': '20110629',
432 },
433 },
774e208f
PH
434 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
435 {
436 'url': '__2ABJjxzNo',
437 'info_dict': {
438 'id': '__2ABJjxzNo',
439 'ext': 'mp4',
440 'upload_date': '20100430',
441 'uploader_id': 'deadmau5',
442 'description': 'md5:12c56784b8032162bb936a5f76d55360',
443 'uploader': 'deadmau5',
444 'title': 'Deadmau5 - Some Chords (HD)',
445 },
446 'expected_warnings': [
447 'DASH manifest missing',
448 ]
e52a40ab
PH
449 },
450 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
451 {
452 'url': 'lqQg6PlCWgI',
453 'info_dict': {
454 'id': 'lqQg6PlCWgI',
455 'ext': 'mp4',
cbe2bd91
PH
456 'upload_date': '20120731',
457 'uploader_id': 'olympic',
458 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
459 'uploader': 'Olympics',
460 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
461 },
462 'params': {
463 'skip_download': 'requires avconv',
e52a40ab 464 }
cbe2bd91 465 },
2eb88d95
PH
466 ]
467
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature with already extracted signature functions
    self._player_cache = dict()
e0df6211 471
c5e8d7af
PH
472 def report_video_info_webpage_download(self, video_id):
473 """Report attempt to download video info webpage."""
69ea8ca4 474 self.to_screen('%s: Downloading video info webpage' % video_id)
c5e8d7af 475
c5e8d7af
PH
476 def report_information_extraction(self, video_id):
477 """Report attempt to extract video information."""
69ea8ca4 478 self.to_screen('%s: Extracting video information' % video_id)
c5e8d7af
PH
479
480 def report_unavailable_format(self, video_id, format):
481 """Report extracted video URL."""
69ea8ca4 482 self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
483
484 def report_rtmp_download(self):
485 """Indicate the download will use the RTMP protocol."""
69ea8ca4 486 self.to_screen('RTMP download detected')
c5e8d7af 487
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of a signature."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
491
492 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 493 id_m = re.match(
60620368 494 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 495 player_url)
c081b35c
PH
496 if not id_m:
497 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
498 player_type = id_m.group('ext')
499 player_id = id_m.group('id')
500
c4417ddb 501 # Read from filesystem cache
60064c53
PH
502 func_id = '%s_%s_%s' % (
503 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 504 assert os.path.basename(func_id) == func_id
a0e07d31 505
69ea8ca4 506 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 507 if cache_spec is not None:
78caa52a 508 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 509
e0df6211
PH
510 if player_type == 'js':
511 code = self._download_webpage(
512 player_url, video_id,
69ea8ca4
PH
513 note='Downloading %s player %s' % (player_type, player_id),
514 errnote='Download of %s failed' % player_url)
83799698 515 res = self._parse_sig_js(code)
c4417ddb 516 elif player_type == 'swf':
e0df6211
PH
517 urlh = self._request_webpage(
518 player_url, video_id,
69ea8ca4
PH
519 note='Downloading %s player %s' % (player_type, player_id),
520 errnote='Download of %s failed' % player_url)
e0df6211 521 code = urlh.read()
83799698 522 res = self._parse_sig_swf(code)
e0df6211
PH
523 else:
524 assert False, 'Invalid player type %r' % player_type
525
a0e07d31 526 if cache_spec is None:
78caa52a 527 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
528 cache_res = res(test_string)
529 cache_spec = [ord(c) for c in cache_res]
83799698 530
69ea8ca4 531 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
532 return res
533
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is probed with a string whose n-th character has code point n (as
    long as example_sig); the code points of the result give, for every
    output position, the input index it was taken from. These indices are
    rendered as a sum of s[...] index/slice expressions and shown via
    to_screen, guarded by a check on the signature's part lengths.
    """
    def gen_sig_code(idxs):
        """Yield s[...] expressions reproducing the index sequence idxs."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # The slice end is exclusive; a negative end means "up to the boundary"
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to emit for an empty index list
            return
        # Seed with the first index so that a one-element list (for which
        # the loop below never runs) still yields its single item
        i = idxs[0]
        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending or descending run starts at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 572
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable running the signature function found in the JS player code."""
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    initial_function = interpreter.extract_function(sig_func_name)

    def run_signature_function(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return run_signature_function
581
def _parse_sig_swf(self, file_contents):
    """Return a callable running SignatureDecipher.decipher from the SWF player."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(decipher_class, 'decipher')

    def run_signature_function(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return run_signature_function
588
83799698 589 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 590 """Turn the encrypted s field into a working signature"""
6b37f0be 591
c8bf86d5 592 if player_url is None:
69ea8ca4 593 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 594
69ea8ca4 595 if player_url.startswith('//'):
78caa52a 596 player_url = 'https:' + player_url
c8bf86d5 597 try:
62af3a0e 598 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
599 if player_id not in self._player_cache:
600 func = self._extract_signature_function(
60064c53 601 video_id, player_url, s
c8bf86d5
PH
602 )
603 self._player_cache[player_id] = func
604 func = self._player_cache[player_id]
605 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 606 self._print_sig_code(func, s)
c8bf86d5
PH
607 return func(s)
608 except Exception as e:
609 tb = traceback.format_exc()
610 raise ExtractorError(
78caa52a 611 'Signature extraction failed: ' + tb, cause=e)
e0df6211 612
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to timedtext URLs.

    The track list is fetched from video.google.com; only the first track
    of every language is used. An empty dict is returned (after a warning)
    if the list cannot be downloaded or contains no tracks. webpage is not
    used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang not in sub_lang_list:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': track.attrib['name'].encode('utf-8'),
            })
            sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query
    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
639
055e6f36 640 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
641 """We need the webpage for getting the captions url, pass it as an
642 argument to speed up the process."""
ca715127 643 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
69ea8ca4 644 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 645 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 646 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
647 if mobj is None:
648 self._downloader.report_warning(err_msg)
649 return {}
650 player_config = json.loads(mobj.group(1))
651 try:
0792d563
PH
652 args = player_config['args']
653 caption_url = args['ttsurl']
654 timestamp = args['timestamp']
055e6f36
JMF
655 # We get the available subtitles
656 list_params = compat_urllib_parse.urlencode({
657 'type': 'list',
658 'tlangs': 1,
659 'asrs': 1,
de7f3446 660 })
055e6f36 661 list_url = caption_url + '&' + list_params
e26f8712 662 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 663 original_lang_node = caption_list.find('track')
7d900ef1 664 if original_lang_node is None:
69ea8ca4 665 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
666 return {}
667 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 668 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
669
670 sub_lang_list = {}
671 for lang_node in caption_list.findall('target'):
672 sub_lang = lang_node.attrib['lang_code']
673 params = compat_urllib_parse.urlencode({
674 'lang': original_lang,
675 'tlang': sub_lang,
676 'fmt': sub_format,
677 'ts': timestamp,
7d900ef1 678 'kind': caption_kind,
055e6f36
JMF
679 })
680 sub_lang_list[sub_lang] = caption_url + '&' + params
681 return sub_lang_list
de7f3446
JMF
682 # An extractor error can be raise by the download process if there are
683 # no automatic captions but there are subtitles
684 except (KeyError, ExtractorError):
685 self._downloader.report_warning(err_msg)
686 return {}
687
97665381
PH
688 @classmethod
689 def extract_id(cls, url):
690 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
c5e8d7af 691 if mobj is None:
69ea8ca4 692 raise ExtractorError('Invalid URL: %s' % url)
c5e8d7af
PH
693 video_id = mobj.group(2)
694 return video_id
695
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its stream URL.

    Every non-empty manifest line that does not start with ``#`` is treated
    as a format URL; its itag is taken from the ``itag/<digits>/`` path
    segment.  When two URLs share an itag the later one wins.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')

    itag_to_url = {}
    for line in manifest.split('\n'):
        # Blank lines and '#'-prefixed lines carry no format URL.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        itag_to_url[itag] = line
    return itag_to_url
710
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for *video_id*."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 714
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return the formats it describes.

    Any ``/s/<signature>`` segment of the manifest URL is replaced by
    ``/signature/<decrypted signature>`` before the download.  One format
    dict is built per ``Representation`` element that has a ``BaseURL``
    child; a representation whose id was already collected updates the
    existing dict instead of adding a new one.
    """
    def _swap_signature(sig_match):
        decrypted = self._decrypt_signature(
            sig_match.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % decrypted

    manifest = self._download_xml(
        re.sub(r'/s/([\w\.]+)', _swap_signature, dash_manifest_url),
        video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    collected = []
    for representation in manifest.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
        base_url = representation.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
        if base_url is None:
            continue
        attrs = representation.attrib
        rep_id = attrs['id']
        entry = {
            'format_id': rep_id,
            'url': base_url.text,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': int_or_none(
                base_url.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')),
            'fps': int_or_none(attrs.get('frameRate')),
        }
        known = next(
            (fmt for fmt in collected if fmt['format_id'] == rep_id), None)
        if known is None:
            # First time this id is seen: merge in the static format table.
            entry.update(self._formats.get(rep_id, {}).items())
            collected.append(entry)
        else:
            known.update(entry)
    return collected
755
def _real_extract(self, url):
    """Extract the metadata and format list for a single YouTube video.

    Downloads the watch page (and, for age-gated videos, the embed page and
    ``get_video_info``), collects the descriptive fields, builds the format
    list from RTMP / stream-map / HLS data and finally merges in the DASH
    manifest formats when enabled.

    Returns the info dict for the video, or ``None`` when the
    ``listsubtitles`` option is set (the subtitles are only listed).

    Raises:
        ExtractorError: when the video info carries no token, the video is
            a rental, the uploader is missing, an rtmpe stream is
            encountered or no usable stream information is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping the URL inside the page's JSON
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace runs
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories (only the first link of the "Category" list is used)
    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Parse the "watch-<count_name>" counter from the page, if present
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Build one format dict per itag, enriched from the static table
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                jsplayer_url_json = self._search_regex(
                    r'"assets":.+?"js":\s*("[^"]+")',
                    embed_webpage if age_gate else video_webpage, 'JS player URL')
                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                # Hide the formats we found through non-DASH
                dash_keys = set(df['format_id'] for df in dash_formats)
                for f in formats:
                    if f['format_id'] in dash_keys:
                        f['format_id'] = 'nondash-%s' % f['format_id']
                        # Not every format dict has a 'preference' entry
                        # (e.g. itags missing from self._formats), so fall
                        # back to 0 instead of raising KeyError.
                        f['preference'] = f.get('preference', 0) - 10000
                formats.extend(dash_formats)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1073
5f6a1245 1074
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including "mix" (``RD…``) lists.

    Accepts playlist URLs (``playlist?list=``, ``watch?...&list=``,
    ``embed/videoseries?list=``, ``/p/<id>`` ...) as well as bare playlist
    ids carrying one of the known prefixes.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
        },
        # Was misspelled 'playlist_mincout', so the check never ran.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before extraction."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist from the watch page of its seed video."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title containers in turn
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return a playlist result for *url*.

        When the URL also names a video (``v=``) and the ``noplaylist``
        option is set, a result for just that video is returned instead.

        Raises:
            ExtractorError: if *url* does not match, or the playlist page
                reports that the playlist does not exist or is private.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1252
1253
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first occurrence first, without repeats."""
        return orderedSet(
            re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page))

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        channel_id = self._match_id(url)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            single_page_entries = [
                self.url_result(vid, 'Youtube', video_id=vid)
                for vid in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(single_page_entries, channel_id)

        def _paged_entries():
            # The first "page" is the channel page itself; later ones come
            # from the "load more" ajax endpoint.
            listing_html = widget_html = channel_page
            next_page = 2
            while True:
                for vid in self.extract_videos_from_page(listing_html):
                    yield self.url_result(vid, 'Youtube', video_id=vid)

                more_link = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    widget_html)
                if more_link is None:
                    return

                payload = self._download_json(
                    'https://youtube.com/%s' % more_link.group('more'), channel_id,
                    'Downloading page #%s' % next_page,
                    transform_source=uppercase_escape)
                listing_html = payload['content_html']
                widget_html = payload['load_more_widget_html']
                next_page += 1

        return self.playlist_result(_paged_entries(), channel_id)
c5e8d7af
PH
1315
1316
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, fetched page by page from the GData feed."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim *url* only when no other ``*IE`` object in this module does.

        The pattern of this extractor is very permissive, so every other
        module-level name ending in ``IE`` gets the first say.
        """
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        username = self._match_id(url)
        page_size = self._GDATA_PAGE_SIZE

        # The API limits the number of results per query (page_size), so
        # pages are requested one at a time; a page without entries yields
        # nothing.
        def download_page(pagenum):
            first = pagenum * page_size + 1
            feed_url = self._GDATA_URL % (username, page_size, first)
            raw = self._download_webpage(
                feed_url, username,
                'Downloading video ids from %d to %d' % (
                    first, first + page_size))

            try:
                response = json.loads(raw)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))

            feed = response['feed']
            if 'entry' in feed:
                for item in feed['entry']:
                    # The video id is the last path component of the entry id
                    item_title = item['title']['$t']
                    vid = item['id']['$t'].split('/')[-1]
                    yield {
                        '_type': 'url',
                        'url': vid,
                        'ie_key': 'Youtube',
                        'id': vid,
                        'title': item_title,
                    }

        paged = OnDemandPagedList(download_page, page_size)
        return self.playlist_result(paged, playlist_title=username)
1385
b05654f0
PH
1386
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (``jsonc`` format)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        PAGE_SIZE = 50
        collected_ids = []
        wanted = n
        page_index = 0

        while PAGE_SIZE * page_index < wanted:
            start_index = PAGE_SIZE * page_index + 1
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            raw = self._download_webpage(
                api_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (page_index + 1),
                errnote='Unable to download API page')
            api_response = json.loads(raw)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                item['id'] for item in api_response['items'])

            # Never ask for more than the API says exists
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]
        results = [
            self.url_result(vid, 'Youtube', video_id=vid)
            for vid in collected_ids]
        return self.playlist_result(results, query)
75dff0ee 1428
c9ae7b95 1429
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API query adds ``orderby=published``."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1435
c9ae7b95
PH
1436
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search result page URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist of the results shown on the search page *url*."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from(heading_html):
            # Title first (optional), then the mandatory link target
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading_html, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', href),
                'title': item_title,
            }

        headings = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)
        return {
            '_type': 'playlist',
            'entries': [_entry_from(heading) for heading in headings],
            'title': query,
        }
1478
1479
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; each season link becomes a playlist entry."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the show's season playlists."""
        show_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, show_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = [
            m.group(1)
            for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen('%s: Found %s seasons' % (show_id, len(season_paths)))

        season_entries = []
        for path in season_paths:
            season_entries.append(self.url_result(
                'https://www.youtube.com' + path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': season_entries,
        }
04cc9617
JMF
1514
1515
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Shared implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page; the remaining %s is the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as one playlist."""
        entries = []
        paging = 0
        for page_num in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)

            # The markup is looked up under 'feed_html' first, then
            # 'content_html'; the "load more" widget falls back to that
            # same markup when it has no key of its own.
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            entries += [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids]

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                # No further "load more" link: this was the last page
                break
            paging = next_page.group('paging')

        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1563
5f6a1245 1564
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    # Feed selection and resulting playlist title
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    # Matches either the feed URL or the ":ytrec" / ":ytrecommended" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
c626a3d9 1570
5f6a1245 1571
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'watch_later' feed (a personal feed)."""
    # Feed selection and resulting playlist title
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'

    # Matches either the feed URL or the ":ytwatchlater" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
c626a3d9 1578
5f6a1245 1579
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'history' feed (a personal feed)."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the pattern contains the regex escape `\.`, which is an
    # invalid escape sequence in a normal string literal. The resulting
    # pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1586
5f6a1245 1587
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to its playlist and delegate to YoutubePlaylist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Fetch the favourites page and hand the playlist id found in it onwards."""
        # The fixed favourites URL is always fetched, whatever `url` was
        # (it may just be the ":ytfav" keyword).
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1598
1599
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed into a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Page through the subscriptions feed and return every video id found."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # Same procedure as for playlists, except that the video id regex
        # carries no index.
        ids = []
        # For the first round both the content and the "load more" widget
        # are searched in the full page.
        content_html = page
        more_widget_html = page

        page_num = 1
        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1636
1637
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lost their parameters and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with quoting advice."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)
772fd5cc
PH
1665
1666
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than 11 characters."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # `www.` is optional so that truncated ids on bare youtube.com URLs are
    # reported as well; the id group accepts 1 to 10 id characters.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)