]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[/__init__] Add another cute search example
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af 23 compat_str,
4bb4a188
PH
24)
25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are served with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as documented above, instead of
            # falling through with an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # instead of dereferencing the failed match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the second-factor form again means the code was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 185
8377574c 186
de7f3446 187class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
2c62dc26
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 290 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
291
292 # Dash webm audio
55db73ef 293 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 294 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 295
0857baad
PH
296 # Dash webm audio with opus inside
297 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
298 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
299 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
300
ce6b9a2d
PH
301 # RTMP (unnamed)
302 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 303 }
836a086c 304
78caa52a 305 IE_NAME = 'youtube'
2eb88d95
PH
306 _TESTS = [
307 {
4bc3a23e
PH
308 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
309 'info_dict': {
310 'id': 'BaW_jenozKc',
311 'ext': 'mp4',
312 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
313 'uploader': 'Philipp Hagemeister',
314 'uploader_id': 'phihag',
315 'upload_date': '20121002',
316 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
317 'categories': ['Science & Technology'],
3e7c1224
PH
318 'like_count': int,
319 'dislike_count': int,
2eb88d95 320 }
0e853ca4 321 },
0e853ca4 322 {
4bc3a23e
PH
323 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
324 'note': 'Test generic use_cipher_signature video (#897)',
325 'info_dict': {
326 'id': 'UxxajLWwzqY',
327 'ext': 'mp4',
328 'upload_date': '20120506',
329 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
330 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
331 'uploader': 'Icona Pop',
332 'uploader_id': 'IconaPop',
2eb88d95 333 }
c108eb73
JMF
334 },
335 {
4bc3a23e
PH
336 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
337 'note': 'Test VEVO video with age protection (#956)',
338 'info_dict': {
339 'id': '07FYdnEawAQ',
340 'ext': 'mp4',
341 'upload_date': '20130703',
342 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
343 'description': 'md5:64249768eec3bc4276236606ea996373',
344 'uploader': 'justintimberlakeVEVO',
345 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
346 }
347 },
fccd3771 348 {
4bc3a23e
PH
349 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
350 'note': 'Embed-only video (#1746)',
351 'info_dict': {
352 'id': 'yZIXLfi8CZQ',
353 'ext': 'mp4',
354 'upload_date': '20120608',
355 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
356 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
357 'uploader': 'SET India',
358 'uploader_id': 'setindia'
fccd3771
PH
359 }
360 },
dd27fd17 361 {
4bc3a23e
PH
362 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
363 'note': '256k DASH audio (format 141) via DASH manifest',
364 'info_dict': {
365 'id': 'a9LDPn-MO4I',
366 'ext': 'm4a',
367 'upload_date': '20121002',
368 'uploader_id': '8KVIDEO',
369 'description': '',
370 'uploader': '8KVIDEO',
371 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 372 },
4bc3a23e
PH
373 'params': {
374 'youtube_include_dash_manifest': True,
375 'format': '141',
4919603f 376 },
dd27fd17 377 },
3489b7d2
JMF
378 # DASH manifest with encrypted signature
379 {
78caa52a
PH
380 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
381 'info_dict': {
382 'id': 'IB3lcPjvWLA',
383 'ext': 'm4a',
b766eb27
JMF
384 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
385 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
386 'uploader': 'AfrojackVEVO',
387 'uploader_id': 'AfrojackVEVO',
388 'upload_date': '20131011',
3489b7d2 389 },
4bc3a23e 390 'params': {
78caa52a
PH
391 'youtube_include_dash_manifest': True,
392 'format': '141',
3489b7d2
JMF
393 },
394 },
aa79ac0c
PH
395 # Controversy video
396 {
397 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
398 'info_dict': {
399 'id': 'T4XJQO3qol8',
400 'ext': 'mp4',
401 'upload_date': '20100909',
402 'uploader': 'The Amazing Atheist',
403 'uploader_id': 'TheAmazingAtheist',
404 'title': 'Burning Everyone\'s Koran',
405 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
406 }
c522adb1
JMF
407 },
408 # Normal age-gate video (No vevo, embed allowed)
409 {
410 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
411 'info_dict': {
412 'id': 'HtVdAasjOgU',
413 'ext': 'mp4',
414 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
415 'description': 'md5:eca57043abae25130f58f655ad9a7771',
416 'uploader': 'The Witcher',
417 'uploader_id': 'WitcherGame',
418 'upload_date': '20140605',
419 },
420 },
fccae2b9
S
421 # Age-gate video with encrypted signature
422 {
423 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
424 'info_dict': {
425 'id': '6kLq3WMV1nU',
426 'ext': 'mp4',
427 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
428 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
429 'uploader': 'LloydVEVO',
430 'uploader_id': 'LloydVEVO',
431 'upload_date': '20110629',
432 },
433 },
774e208f
PH
434 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
435 {
436 'url': '__2ABJjxzNo',
437 'info_dict': {
438 'id': '__2ABJjxzNo',
439 'ext': 'mp4',
440 'upload_date': '20100430',
441 'uploader_id': 'deadmau5',
442 'description': 'md5:12c56784b8032162bb936a5f76d55360',
443 'uploader': 'deadmau5',
444 'title': 'Deadmau5 - Some Chords (HD)',
445 },
446 'expected_warnings': [
447 'DASH manifest missing',
448 ]
e52a40ab
PH
449 },
450 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
451 {
452 'url': 'lqQg6PlCWgI',
453 'info_dict': {
454 'id': 'lqQg6PlCWgI',
455 'ext': 'mp4',
cbe2bd91
PH
456 'upload_date': '20120731',
457 'uploader_id': 'olympic',
458 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
459 'uploader': 'Olympics',
460 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
461 },
462 'params': {
463 'skip_download': 'requires avconv',
e52a40ab 464 }
cbe2bd91 465 },
2eb88d95
PH
466 ]
467
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature cache id).
    self._player_cache = dict()
e0df6211 471
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is about to be downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 475
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that video information is about to be extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
479
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available for this video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
483
def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 487
60064c53
PH
488 def _signature_cache_id(self, example_sig):
489 """ Return a string representation of a signature """
78caa52a 490 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
491
492 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 493 id_m = re.match(
60620368 494 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 495 player_url)
c081b35c
PH
496 if not id_m:
497 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
498 player_type = id_m.group('ext')
499 player_id = id_m.group('id')
500
c4417ddb 501 # Read from filesystem cache
60064c53
PH
502 func_id = '%s_%s_%s' % (
503 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 504 assert os.path.basename(func_id) == func_id
a0e07d31 505
69ea8ca4 506 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 507 if cache_spec is not None:
78caa52a 508 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 509
e0df6211
PH
510 if player_type == 'js':
511 code = self._download_webpage(
512 player_url, video_id,
69ea8ca4
PH
513 note='Downloading %s player %s' % (player_type, player_id),
514 errnote='Download of %s failed' % player_url)
83799698 515 res = self._parse_sig_js(code)
c4417ddb 516 elif player_type == 'swf':
e0df6211
PH
517 urlh = self._request_webpage(
518 player_url, video_id,
69ea8ca4
PH
519 note='Downloading %s player %s' % (player_type, player_id),
520 errnote='Download of %s failed' % player_url)
e0df6211 521 code = urlh.read()
83799698 522 res = self._parse_sig_swf(code)
e0df6211
PH
523 else:
524 assert False, 'Invalid player type %r' % player_type
525
a0e07d31 526 if cache_spec is None:
78caa52a 527 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
528 cache_res = res(test_string)
529 cache_spec = [ord(c) for c in cache_res]
83799698 530
69ea8ca4 531 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
532 return res
533
def _print_sig_code(self, func, example_sig):
    """Print Python source that is equivalent to the signature function func.

    func is applied to a probe string whose characters are their own
    indices; the resulting index list is rendered as a sum of 's[...]'
    index and slice expressions and written out with to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render an inclusive run first..last (moving by stride) as a slice.
        lower = str(first) if first != 0 else ''
        bound = last + stride
        upper = ':' if bound < 0 else ':%d' % bound
        suffix = ':%d' % stride if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, suffix)

    def _sig_terms(indices):
        stride = None
        # Squelch pyflakes warnings - first is always set together with stride
        first = '(Never used)'
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is not None:
                # Inside a run: keep going while the step stays the same,
                # otherwise emit the run and start over.
                if delta != stride:
                    yield _slice_expr(first, before, stride)
                    stride = None
            elif delta in (-1, 1):
                stride = delta
                first = before
            else:
                yield 's[%d]' % before
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(first, cur, stride)

    probe = ''.join(compat_chr(n) for n in range(len(example_sig)))
    index_spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(_sig_terms(index_spec))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 572
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature-deciphering callable from the JS player source."""
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    entry_point = interpreter.extract_function(funcname)

    def _decipher(s):
        # The extracted function takes its arguments as a list.
        return entry_point([s])

    return _decipher
581
def _parse_sig_swf(self, file_contents):
    """Build a signature-deciphering callable from the SWF player contents."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    entry_point = interpreter.extract_function(decipher_class, 'decipher')

    def _decipher(s):
        # The extracted function takes its arguments as a list.
        return entry_point([s])

    return _decipher
588
83799698 589 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 590 """Turn the encrypted s field into a working signature"""
6b37f0be 591
c8bf86d5 592 if player_url is None:
69ea8ca4 593 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 594
69ea8ca4 595 if player_url.startswith('//'):
78caa52a 596 player_url = 'https:' + player_url
c8bf86d5 597 try:
62af3a0e 598 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
599 if player_id not in self._player_cache:
600 func = self._extract_signature_function(
60064c53 601 video_id, player_url, s
c8bf86d5
PH
602 )
603 self._player_cache[player_id] = func
604 func = self._player_cache[player_id]
605 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 606 self._print_sig_code(func, s)
c8bf86d5
PH
607 return func(s)
608 except Exception as e:
609 tb = traceback.format_exc()
610 raise ExtractorError(
78caa52a 611 'Signature extraction failed: ' + tb, cause=e)
e0df6211 612
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to timedtext URLs.

    The track list is downloaded as XML; only the first track seen for
    each language is kept. An empty dict is returned (after a warning)
    when the list cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang not in sub_lang_list:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': track.attrib['name'].encode('utf-8'),
            })
            sub_lang_list[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
639
055e6f36 640 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
641 """We need the webpage for getting the captions url, pass it as an
642 argument to speed up the process."""
ca715127 643 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
69ea8ca4 644 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 645 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 646 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
647 if mobj is None:
648 self._downloader.report_warning(err_msg)
649 return {}
650 player_config = json.loads(mobj.group(1))
651 try:
0792d563
PH
652 args = player_config['args']
653 caption_url = args['ttsurl']
654 timestamp = args['timestamp']
055e6f36
JMF
655 # We get the available subtitles
656 list_params = compat_urllib_parse.urlencode({
657 'type': 'list',
658 'tlangs': 1,
659 'asrs': 1,
de7f3446 660 })
055e6f36 661 list_url = caption_url + '&' + list_params
e26f8712 662 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 663 original_lang_node = caption_list.find('track')
7d900ef1 664 if original_lang_node is None:
69ea8ca4 665 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
666 return {}
667 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 668 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
669
670 sub_lang_list = {}
671 for lang_node in caption_list.findall('target'):
672 sub_lang = lang_node.attrib['lang_code']
673 params = compat_urllib_parse.urlencode({
674 'lang': original_lang,
675 'tlang': sub_lang,
676 'fmt': sub_format,
677 'ts': timestamp,
7d900ef1 678 'kind': caption_kind,
055e6f36
JMF
679 })
680 sub_lang_list[sub_lang] = caption_url + '&' + params
681 return sub_lang_list
de7f3446
JMF
682 # An extractor error can be raise by the download process if there are
683 # no automatic captions but there are subtitles
684 except (KeyError, ExtractorError):
685 self._downloader.report_warning(err_msg)
686 return {}
687
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (group 2 of _VALID_URL) found in url.

    Raises ExtractorError if url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
695
1d043b93
JMF
696 def _extract_from_m3u8(self, manifest_url, video_id):
697 url_map = {}
5f6a1245 698
1d043b93
JMF
699 def _get_urls(_manifest):
700 lines = _manifest.split('\n')
701 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 702 lines)
1d043b93 703 return urls
78caa52a 704 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
705 formats_urls = _get_urls(manifest)
706 for format_url in formats_urls:
890f62e8 707 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
708 url_map[itag] = format_url
709 return url_map
710
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for *video_id*."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 714
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download the DASH manifest and return its formats as a list of dicts.

    Any encrypted ``/s/<sig>`` component of the manifest URL is replaced by
    the decrypted ``/signature/<sig>`` before downloading.  Representations
    without a BaseURL element are skipped.  When a representation id is
    seen more than once, the later entry updates the earlier one in place.
    """
    mpd_ns = '{urn:mpeg:DASH:schema:MPD:2011}'

    def _replace_signature(sig_match):
        decrypted = self._decrypt_signature(
            sig_match.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % decrypted

    dash_manifest_url = re.sub(
        r'/s/([\w\.]+)', _replace_signature, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    # Index of the formats collected so far, keyed by format id
    seen_formats = {}
    for representation in dash_doc.findall('.//' + mpd_ns + 'Representation'):
        base_url_el = representation.find(mpd_ns + 'BaseURL')
        if base_url_el is None:
            continue
        attrs = representation.attrib
        format_id = attrs['id']
        video_url = base_url_el.text
        filesize = int_or_none(base_url_el.attrib.get(
            '{http://youtube.com/yt/2012/10/10}contentLength'))
        fmt = {
            'format_id': format_id,
            'url': video_url,
            'width': int_or_none(attrs.get('width')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': filesize,
            'fps': int_or_none(attrs.get('frameRate')),
        }
        if format_id in seen_formats:
            seen_formats[format_id].update(fmt)
        else:
            # First occurrence: start from the static format table entry
            fmt.update(self._formats.get(format_id, {}))
            formats.append(fmt)
            seen_formats[format_id] = fmt
    return formats
754
def _real_extract(self, url):
    """Extract the metadata and the available formats of a single video.

    The watch page is downloaded first; for age-gated videos the embed
    page and get_video_info are used instead of the player config found in
    the watch page.  Formats are built from the RTMP connection, the
    encoded stream maps or the HLS manifest, and are extended with the
    DASH manifest unless 'youtube_include_dash_manifest' is disabled.

    Returns the info dict of the video, or None when the 'listsubtitles'
    option is set (the available subtitles are only listed in that case).

    Raises ExtractorError when the video info lacks a token, for "rental"
    videos, when the uploader is missing, for rtmpe streams, or when no
    source of formats is found.
    """
    if self._downloader.params.get('prefer_insecure', False):
        proto = 'http'
    else:
        proto = 'https'

    # URLs that went through a redirection (like age verification) carry
    # the original video URL in their next_url parameter
    next_url_mobj = re.search(self._NEXT_URL_RE, url)
    if next_url_mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(next_url_mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    watch_url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(watch_url, video_id)

    # Attempt to extract SWF player URL
    swf_mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    player_url = None
    if swf_mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', swf_mobj.group(1))

    # Get video info
    age_gate = re.search(r'player-age-gate-content">', video_webpage) is not None
    if age_gate:
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(embed_url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        try:
            # Try looking directly into the video webpage
            config_mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not config_mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            ytplayer_config = json.loads(uppercase_escape(config_mobj.group(1)))
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break

    if 'token' not in video_info:
        if 'reason' not in video_info:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)
        raise ExtractorError(
            'YouTube said: %s' % video_info['reason'][0],
            expected=True, video_id=video_id)

    view_count = None
    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    uploader_mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if uploader_mobj is None:
        self._downloader.report_warning('unable to extract uploader nickname')
    else:
        video_uploader_id = uploader_mobj.group(1)

    # title
    if 'title' not in video_info:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'
    else:
        video_title = video_info['title'][0]

    # thumbnail image
    # We try first to get a high quality image:
    thumb_mobj = re.search(
        r'<span itemprop="thumbnail".*?href="(.*?)">',
        video_webpage, re.DOTALL)
    if thumb_mobj is not None:
        video_thumbnail = thumb_mobj.group(1)
    elif 'thumbnail_url' in video_info:
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
    else:
        # don't panic if we can't find it
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None

    # upload date
    upload_date = None
    date_mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if date_mobj is None:
        date_mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if date_mobj is not None:
        # Normalize the separators to single spaces before parsing
        date_text = re.sub(r'[/,-]', r' ', date_mobj.group(1))
        upload_date = unified_strdate(' '.join(date_text.split()))

    # categories
    video_categories = None
    cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', cat_container, 'category',
            default=None)
        if category is not None:
            video_categories = [category]

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the target given in their title
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        meta_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(meta_mobj.group(1)) if meta_mobj else ''

    def _extract_count(count_name):
        # Returns the counter shown in the "watch-<count_name>" element,
        # or None if it cannot be found
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is None:
            return None
        return int(count.replace(',', ''))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    # duration
    if 'length_seconds' in video_info:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
    else:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # player_url is looked up when this helper is called, so it
        # reflects any reassignment made while the stream map was parsed
        result = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            result.append(dct)
        return result

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            format_url = url_data['url'][0]

            if 'sig' in url_data:
                format_url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]

                jsplayer_url_json = self._search_regex(
                    r'"assets":.+?"js":\s*("[^"]+")',
                    embed_webpage if age_gate else video_webpage, 'JS player URL')
                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_desc = 'unknown'
                    elif player_url.endswith('swf'):
                        player_version = self._search_regex(
                            r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                            'flash player', fatal=False)
                        player_desc = 'flash player %s' % player_version
                    else:
                        player_version = self._search_regex(
                            r'html5player-([^/]+?)(?:/html5player)?\.js',
                            player_url,
                            'html5 player', fatal=False)
                        player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen(
                        '{%s} signature length %s, %s'
                        % (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                format_url += '&signature=' + signature
            if 'ratebypass' not in format_url:
                format_url += '&ratebypass=yes'
            url_map[format_id] = format_url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        formats = _map_to_format_list(
            self._extract_from_m3u8(manifest_url, video_id))
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_mpd[0], player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                formats.extend(dash_formats)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1066
5f6a1245 1067
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including the 'RD' mixes."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
        },
        # The key was misspelled 'playlist_mincout', so the minimum count
        # was never checked for this test
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Return one url result (handled by the 'Youtube' extractor) per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist from the watch page of its seed video."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract all the videos of the playlist referenced by *url*.

        When the URL also names a video ('v' query parameter) and the
        'noplaylist' option is set, only that video is returned.

        Raises ExtractorError for invalid URLs, for 'TL' top lists and for
        playlists reported as missing or private.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # The "load more" widget carries the URL of the next chunk
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1248
1249
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
               ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]

    def _real_extract(self, url):
        """Find the requested list on the channel page and return its videos."""
        url_match = re.match(self._VALID_URL, url)
        channel = url_match.group('chann')
        title = url_match.group('title')
        query = compat_urllib_parse.urlencode({'title': title})

        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        list_url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = 'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(list_url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=title)
1291
1292
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, without duplicates and in page order."""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id not in unique_ids:
                unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with the videos of the channel."""
        channel_id = self._match_id(url)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id in self.extract_videos_from_page(channel_page):
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Lazily walk the channel, following the "load more" links
            content_html = more_widget_html = channel_page
            pagenum = 1
            while True:
                for video_id in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                more_mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not more_mobj:
                    return

                pagenum += 1
                more = self._download_json(
                    'https://youtube.com/%s' % more_mobj.group('more'), channel_id,
                    'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1354
1355
class YoutubeUserIE(InfoExtractor):
    """Extractor for all the uploads of a YouTube user (GData API)."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module matches *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result that lazily pages through the user's uploads."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The API indexes are 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            # The reported range is inclusive, so a page of N entries ends
            # at start_index + N - 1
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE - 1))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No more uploads
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1424
b05654f0
PH
1425
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for the "ytsearch" keyword, backed by the GData API."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))

        video_ids = []
        limit = n
        offset = 0  # number of results requested so far
        page_number = 1
        while offset < limit:
            result_url = self._API_URL % (quoted_query, offset + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % page_number,
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more results than the API says exist
            limit = min(n, api_response['totalItems'])
            offset += PAGE_SIZE
            page_number += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = []
        for video_id in video_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
75dff0ee 1467
c9ae7b95 1468
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor that orders the results by publication date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint as the parent class, with orderby=published appended
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1474
c9ae7b95
PH
1475
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for the URLs of YouTube search result pages."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result listed on the page."""
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Every result has its title and link inside an <h3> element
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1517
1518
class YoutubeShowIE(InfoExtractor):
    """Expand a YouTube show page into a playlist of its season playlists."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """
        Download the show page and return a playlist dict with one
        'YoutubePlaylist' url_result per playlist link found in it.
        """
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # Every season of the show is published as a playlist of its own
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is optional: a missing og:title does not abort extraction
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1553
1554
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common implementation for extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Concrete subclasses have to provide the _FEED_NAME and _PLAYLIST_TITLE
    attributes.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """Feed URL for this extractor, with one %s left open for the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """
        Download the feed page by page and return a playlist of every
        video linked from the pages, titled with _PLAYLIST_TITLE.
        """
        entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Fall back to the feed markup when no separate widget markup is sent
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            # Duplicates within a single page are dropped, order is kept
            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            entries.extend([
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids])

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                # No "load more" link: this was the last page
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1602
5f6a1245 1603
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Accepts the feed page URL as well as the :ytrec / :ytrecommended keywords
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1609
5f6a1245 1610
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    # Accepts the feed page URL as well as the :ytwatchlater keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1617
5f6a1245 1618
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' is a regex escape, not a string escape. Without the
    # r prefix Python warns about an invalid escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1625
5f6a1245 1626
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its underlying playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """
        Fetch the my_favorites page and delegate to the playlist extractor,
        using the first ``list=`` value found in that page.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1637
1638
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed, following its "load more" links."""
    IE_NAME = 'youtube:subscriptions'
    # The keyword accepted by _VALID_URL carries a leading colon; keep the
    # description in sync with it and with the sibling feed extractors.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """
        Download the subscriptions feed page and every continuation page it
        links to, and return a playlist dict with the video ids found.
        """
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        # The first page is plain HTML, so it serves as both the content and
        # the place to look for the "load more" link
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            # Duplicates are removed per page only
            new_ids = orderedSet(matches)
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                # No continuation link: last page reached
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            # NOTE(review): _ids_to_results is presumably inherited from
            # YoutubePlaylistIE - it is not defined in this class
            'entries': self._ids_to_results(ids),
        }
1675
1676
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lost their parameters and explain why."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError telling the user to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)