]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[mit] Modernize
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af 23 compat_str,
4bb4a188
PH
24)
25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie (containing hl=en) on .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # A failed download is a failed login: return False as documented
            # (previously this fell through with a bare ``return``).
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # instead of crashing on ``None.group``.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.  The result of the login
        attempt is not propagated.
        """
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 185
8377574c 186
de7f3446 187class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
2c62dc26
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 290 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
2c62dc26
PH
291
292 # Dash webm audio
55db73ef 293 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 294 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 295
0857baad
PH
296 # Dash webm audio with opus inside
297 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
298 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
299 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
300
ce6b9a2d
PH
301 # RTMP (unnamed)
302 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 303 }
836a086c 304
78caa52a 305 IE_NAME = 'youtube'
2eb88d95
PH
306 _TESTS = [
307 {
4bc3a23e
PH
308 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
309 'info_dict': {
310 'id': 'BaW_jenozKc',
311 'ext': 'mp4',
312 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
313 'uploader': 'Philipp Hagemeister',
314 'uploader_id': 'phihag',
315 'upload_date': '20121002',
316 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
317 'categories': ['Science & Technology'],
3e7c1224
PH
318 'like_count': int,
319 'dislike_count': int,
2eb88d95 320 }
0e853ca4 321 },
0e853ca4 322 {
4bc3a23e
PH
323 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
324 'note': 'Test generic use_cipher_signature video (#897)',
325 'info_dict': {
326 'id': 'UxxajLWwzqY',
327 'ext': 'mp4',
328 'upload_date': '20120506',
329 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
330 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
331 'uploader': 'Icona Pop',
332 'uploader_id': 'IconaPop',
2eb88d95 333 }
c108eb73
JMF
334 },
335 {
4bc3a23e
PH
336 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
337 'note': 'Test VEVO video with age protection (#956)',
338 'info_dict': {
339 'id': '07FYdnEawAQ',
340 'ext': 'mp4',
341 'upload_date': '20130703',
342 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
343 'description': 'md5:64249768eec3bc4276236606ea996373',
344 'uploader': 'justintimberlakeVEVO',
345 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
346 }
347 },
fccd3771 348 {
4bc3a23e
PH
349 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
350 'note': 'Embed-only video (#1746)',
351 'info_dict': {
352 'id': 'yZIXLfi8CZQ',
353 'ext': 'mp4',
354 'upload_date': '20120608',
355 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
356 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
357 'uploader': 'SET India',
358 'uploader_id': 'setindia'
fccd3771
PH
359 }
360 },
dd27fd17 361 {
4bc3a23e
PH
362 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
363 'note': '256k DASH audio (format 141) via DASH manifest',
364 'info_dict': {
365 'id': 'a9LDPn-MO4I',
366 'ext': 'm4a',
367 'upload_date': '20121002',
368 'uploader_id': '8KVIDEO',
369 'description': '',
370 'uploader': '8KVIDEO',
371 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 372 },
4bc3a23e
PH
373 'params': {
374 'youtube_include_dash_manifest': True,
375 'format': '141',
4919603f 376 },
dd27fd17 377 },
3489b7d2
JMF
378 # DASH manifest with encrypted signature
379 {
78caa52a
PH
380 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
381 'info_dict': {
382 'id': 'IB3lcPjvWLA',
383 'ext': 'm4a',
b766eb27
JMF
384 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
385 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
386 'uploader': 'AfrojackVEVO',
387 'uploader_id': 'AfrojackVEVO',
388 'upload_date': '20131011',
3489b7d2 389 },
4bc3a23e 390 'params': {
78caa52a
PH
391 'youtube_include_dash_manifest': True,
392 'format': '141',
3489b7d2
JMF
393 },
394 },
aa79ac0c
PH
395 # Controversy video
396 {
397 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
398 'info_dict': {
399 'id': 'T4XJQO3qol8',
400 'ext': 'mp4',
401 'upload_date': '20100909',
402 'uploader': 'The Amazing Atheist',
403 'uploader_id': 'TheAmazingAtheist',
404 'title': 'Burning Everyone\'s Koran',
405 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
406 }
c522adb1
JMF
407 },
408 # Normal age-gate video (No vevo, embed allowed)
409 {
410 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
411 'info_dict': {
412 'id': 'HtVdAasjOgU',
413 'ext': 'mp4',
414 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
415 'description': 'md5:eca57043abae25130f58f655ad9a7771',
416 'uploader': 'The Witcher',
417 'uploader_id': 'WitcherGame',
418 'upload_date': '20140605',
419 },
420 },
774e208f
PH
421 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
422 {
423 'url': '__2ABJjxzNo',
424 'info_dict': {
425 'id': '__2ABJjxzNo',
426 'ext': 'mp4',
427 'upload_date': '20100430',
428 'uploader_id': 'deadmau5',
429 'description': 'md5:12c56784b8032162bb936a5f76d55360',
430 'uploader': 'deadmau5',
431 'title': 'Deadmau5 - Some Chords (HD)',
432 },
433 'expected_warnings': [
434 'DASH manifest missing',
435 ]
e52a40ab
PH
436 },
437 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
438 {
439 'url': 'lqQg6PlCWgI',
440 'info_dict': {
441 'id': 'lqQg6PlCWgI',
442 'ext': 'mp4',
cbe2bd91
PH
443 'upload_date': '20120731',
444 'uploader_id': 'olympic',
445 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
446 'uploader': 'Olympics',
447 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
448 },
449 'params': {
450 'skip_download': 'requires avconv',
e52a40ab 451 }
cbe2bd91 452 },
2eb88d95
PH
453 ]
454
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions, keyed by (player_url, signature cache id)
    self._player_cache = dict()
e0df6211 458
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 462
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
466
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
470
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 474
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of a signature."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
478
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that transforms an encrypted signature string.

    The player id and type (file extension) are parsed out of player_url.
    If the downloader's 'youtube-sigfuncs' cache already holds an index list
    for this player and signature shape, a function applying that index list
    is returned.  Otherwise the player is downloaded and parsed, the index
    list it produces for a probe string is stored in the cache, and the
    parsed function is returned.

    Raises ExtractorError if player_url cannot be parsed or if the player
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Invariant: the cache key must be a bare name with no path components
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise explicitly: an ``assert`` is stripped under ``python -O`` and
        # would leave ``res`` unbound below.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a probe string whose i-th character has code point
    # i; the code points of the output are then the source index of each
    # output character, which is what gets cached.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
520
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is applied to a probe string whose i-th character has code
    point i, so the output's code points are the source indices.  These are
    rendered as a sum of ``s[...]`` index and slice expressions and shown
    via to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            first = '' if start == 0 else str(start)
            stop = end + step
            last = (':%d' % stop) if stop >= 0 else ':'
            stride = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (first, last, stride)

        step = None
        # Quelch pyflakes warnings - start will be set as soon as step is set
        start = '(Never used)'
        for cur, prev in zip(idxs[1:], idxs[:-1]):
            delta = cur - prev
            if step is not None:
                # Inside a run: keep going while the stride is unchanged,
                # otherwise emit the finished run as a slice.
                if delta != step:
                    yield _genslice(start, prev, step)
                    step = None
                continue
            if delta in [-1, 1]:
                # A new run of consecutive indices starts at prev
                step = delta
                start = prev
            else:
                yield 's[%d]' % prev
        # Flush whatever is pending after the last pair
        if step is None:
            yield 's[%d]' % cur
        else:
            yield _genslice(start, cur, step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permuted = func(probe)
    index_list = [ord(ch) for ch in permuted]
    expr_code = ' + '.join(gen_sig_code(index_list))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 559
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a one-argument callable wrapping the JS player's signature function.

    The function name is taken from the text following ``.sig||`` in jscode
    and the function is extracted with JSInterpreter.
    """
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
        'Initial JS player signature function name')

    initial_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return decipher
568
def _parse_sig_swf(self, file_contents):
    """Return a one-argument callable wrapping the SWF player's decipher function.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    from file_contents with SWFInterpreter.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return initial_function([s])

    return decipher
575
83799698 576 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 577 """Turn the encrypted s field into a working signature"""
6b37f0be 578
c8bf86d5 579 if player_url is None:
69ea8ca4 580 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 581
69ea8ca4 582 if player_url.startswith('//'):
78caa52a 583 player_url = 'https:' + player_url
c8bf86d5 584 try:
62af3a0e 585 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
586 if player_id not in self._player_cache:
587 func = self._extract_signature_function(
60064c53 588 video_id, player_url, s
c8bf86d5
PH
589 )
590 self._player_cache[player_id] = func
591 func = self._player_cache[player_id]
592 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 593 self._print_sig_code(func, s)
c8bf86d5
PH
594 return func(s)
595 except Exception as e:
596 tb = traceback.format_exc()
597 raise ExtractorError(
78caa52a 598 'Signature extraction failed: ' + tb, cause=e)
e0df6211 599
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping subtitle language codes to timedtext URLs.

    The track list is downloaded from video.google.com; only the first track
    seen for each language is kept.  An empty dict is returned (after a
    warning) when the list cannot be downloaded or contains no tracks.
    webpage is accepted but not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    urls_by_lang = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    for track_name, lang in tracks:
        if lang in urls_by_lang:
            continue
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        urls_by_lang[lang] = 'https://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
627
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language codes to automatic-caption URLs.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.  An empty dict is returned (after a
    warning) when the player config is missing from the webpage, when the
    first track is not of kind 'asr', or when a required key is missing or
    the caption list download fails.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']
        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)
        source_track = caption_list.find('track')
        has_asr_track = (
            source_track is not None and
            source_track.attrib.get('kind') == 'asr')
        if not has_asr_track:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = source_track.attrib['lang_code']

        captions = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            query = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': target_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            captions[target_lang] = caption_url + '&' + query
        return captions
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
674
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID (second group of _VALID_URL) found in url.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
682
1d043b93
JMF
683 def _extract_from_m3u8(self, manifest_url, video_id):
684 url_map = {}
5f6a1245 685
1d043b93
JMF
686 def _get_urls(_manifest):
687 lines = _manifest.split('\n')
688 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 689 lines)
1d043b93 690 return urls
78caa52a 691 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
692 formats_urls = _get_urls(manifest)
693 for format_url in formats_urls:
890f62e8 694 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
695 url_map[itag] = format_url
696 return url_map
697
1fb07d10
JG
698 def _extract_annotations(self, video_id):
699 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 700 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 701
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return a list of format dicts.

    Any ``/s/<sig>`` component of dash_manifest_url is replaced by
    ``/signature/<decrypted sig>`` before downloading.  One dict is built
    per Representation element that has a BaseURL child; a repeated format
    id updates the dict already collected for that id instead of adding a
    new one.
    """
    def decrypt_sig(mobj):
        decrypted = self._decrypt_signature(
            mobj.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % decrypted

    dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    for representation in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
        url_el = representation.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
        if url_el is None:
            continue
        attrs = representation.attrib
        format_id = attrs['id']
        video_url = url_el.text
        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
        f = {
            'format_id': format_id,
            'url': video_url,
            'width': int_or_none(attrs.get('width')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': filesize,
            'fps': int_or_none(attrs.get('frameRate')),
        }
        for known in formats:
            if known['format_id'] == format_id:
                # Same id seen before: refresh the first matching entry
                known.update(f)
                break
        else:
            # New id: overlay the static per-itag metadata, then collect it
            f.update(self._formats.get(format_id, {}))
            formats.append(f)
    return formats
741
def _real_extract(self, url):
    """Extract metadata and downloadable formats for a single video.

    Steps, in order: resolve the video id, download the watch page, obtain
    the ``video_info`` mapping (from the embedded ``ytplayer.config`` or the
    ``get_video_info`` endpoint), scrape metadata from the page, build the
    format list (RTMP, URL-encoded stream map or m3u8 manifest) and
    finally merge in formats from the DASH manifest.

    Returns the info dict, or None when the 'listsubtitles' option is set
    (available subtitles are only listed in that case).

    Raises ExtractorError when the video info carries no token, for rental
    videos, when the uploader name is missing, for rtmpe streams, or when
    no stream information is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes of the embedded string (e.g. '\/' -> '/')
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except (ValueError, KeyError):
            # KeyError: the config was found but has no 'args' entry; treat
            # it like any other unusable config instead of crashing.
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    # Accept both http and https canonical links.
    mobj = re.search(r'<link itemprop="url" href="https?://www\.youtube\.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Normalise separators and whitespace before parsing the date
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the text of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Return the integer shown in the "watch-<count_name>" element, or None
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn {itag: url} into format dicts, enriched from self._formats.
        # player_url is read when this helper is called, i.e. after the
        # signature handling below may have updated it.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # Encrypted signature: needs the player to be decrypted
                encrypted_sig = url_data['s'][0]

                if not age_gate:
                    jsplayer_url_json = self._search_regex(
                        r'"assets":.+?"js":\s*("[^"]+")',
                        video_webpage, 'JS player URL')
                    player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                # A broken manifest only costs the DASH formats
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                formats.extend(dash_formats)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1052
5f6a1245 1053
880e1c52 1054class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
78caa52a 1055 IE_DESC = 'YouTube.com playlists'
d67cc9fa 1056 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1057 (?:https?://)?
1058 (?:\w+\.)?
1059 youtube\.com/
1060 (?:
ac7553d0 1061 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
c5e8d7af
PH
1062 \? (?:.*?&)*? (?:p|a|list)=
1063 | p/
1064 )
d67cc9fa 1065 (
7d568f5a 1066 (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
5f6a1245 1067 # Top tracks, they can also include dots
d67cc9fa
JMF
1068 |(?:MC)[\w\.]*
1069 )
c5e8d7af
PH
1070 .*
1071 |
7d568f5a 1072 ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
c5e8d7af 1073 )"""
dbb94fb0 1074 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dbb94fb0 1075 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
78caa52a 1076 IE_NAME = 'youtube:playlist'
81127aa5
PH
1077 _TESTS = [{
1078 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1079 'info_dict': {
1080 'title': 'ytdl test PL',
a1cf99d0 1081 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
1082 },
1083 'playlist_count': 3,
9291475f
PH
1084 }, {
1085 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1086 'info_dict': {
1087 'title': 'YDL_Empty_List',
1088 },
1089 'playlist_count': 0,
1090 }, {
1091 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1092 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1093 'info_dict': {
1094 'title': '29C3: Not my department',
1095 },
1096 'playlist_count': 95,
1097 }, {
1098 'note': 'issue #673',
1099 'url': 'PLBB231211A4F62143',
1100 'info_dict': {
f46a8702 1101 'title': '[OLD]Team Fortress 2 (Class-based LP)',
9291475f
PH
1102 },
1103 'playlist_mincount': 26,
1104 }, {
1105 'note': 'Large playlist',
1106 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1107 'info_dict': {
1108 'title': 'Uploads from Cauchemar',
1109 },
1110 'playlist_mincount': 799,
1111 }, {
1112 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1113 'info_dict': {
1114 'title': 'YDL_safe_search',
1115 },
1116 'playlist_count': 2,
ac7553d0
PH
1117 }, {
1118 'note': 'embedded',
1119 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1120 'playlist_count': 4,
1121 'info_dict': {
1122 'title': 'JODA15',
1123 }
6b08cdf6
PH
1124 }, {
1125 'note': 'Embedded SWF player',
1126 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1127 'playlist_count': 4,
1128 'info_dict': {
1129 'title': 'JODA7',
1130 }
4b7df0d3
JMF
1131 }, {
1132 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1133 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1134 'info_dict': {
1135 'title': 'Uploads from Interstellar Movie',
1136 },
1137 'playlist_mincout': 21,
81127aa5 1138 }]
c5e8d7af 1139
880e1c52
JMF
1140 def _real_initialize(self):
1141 self._login()
1142
652cdaa2 1143 def _ids_to_results(self, ids):
c9cc0bf5
PH
1144 return [
1145 self.url_result(vid_id, 'Youtube', video_id=vid_id)
1146 for vid_id in ids]
652cdaa2
JMF
1147
1148 def _extract_mix(self, playlist_id):
1149 # The mixes are generated from a a single video
1150 # the id of the playlist is just 'RD' + video_id
7d4afc55 1151 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
c9cc0bf5 1152 webpage = self._download_webpage(
78caa52a 1153 url, playlist_id, 'Downloading Youtube mix')
bc2f773b 1154 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
c9cc0bf5
PH
1155 title_span = (
1156 search_title('playlist-title') or
1157 search_title('title long-title') or
1158 search_title('title'))
76d1700b 1159 title = clean_html(title_span)
c9cc0bf5
PH
1160 ids = orderedSet(re.findall(
1161 r'''(?xs)data-video-username=".*?".*?
1162 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
1163 webpage))
652cdaa2
JMF
1164 url_results = self._ids_to_results(ids)
1165
1166 return self.playlist_result(url_results, playlist_id, title)
1167
c5e8d7af
PH
1168 def _real_extract(self, url):
1169 # Extract playlist id
d67cc9fa 1170 mobj = re.match(self._VALID_URL, url)
c5e8d7af 1171 if mobj is None:
69ea8ca4 1172 raise ExtractorError('Invalid URL: %s' % url)
47192f92
FV
1173 playlist_id = mobj.group(1) or mobj.group(2)
1174
1175 # Check if it's a video-specific URL
7c61bd36 1176 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
47192f92
FV
1177 if 'v' in query_dict:
1178 video_id = query_dict['v'][0]
1179 if self._downloader.params.get('noplaylist'):
69ea8ca4 1180 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
7012b23c 1181 return self.url_result(video_id, 'Youtube', video_id=video_id)
47192f92 1182 else:
69ea8ca4 1183 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
c5e8d7af 1184
7d4afc55 1185 if playlist_id.startswith('RD'):
652cdaa2
JMF
1186 # Mixes require a custom extraction process
1187 return self._extract_mix(playlist_id)
0a688bc0 1188 if playlist_id.startswith('TL'):
69ea8ca4 1189 raise ExtractorError('For downloading YouTube.com top lists, use '
8bcc8756 1190 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
652cdaa2 1191
dbb94fb0
S
1192 url = self._TEMPLATE_URL % playlist_id
1193 page = self._download_webpage(url, playlist_id)
1194 more_widget_html = content_html = page
1195
10c0e2d8 1196 # Check if the playlist exists or is private
e399853d 1197 if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
10c0e2d8 1198 raise ExtractorError(
78caa52a 1199 'The playlist doesn\'t exist or is private, use --username or '
10c0e2d8
JMF
1200 '--netrc to access it.',
1201 expected=True)
1202
dcbb4580
JMF
1203 # Extract the video ids from the playlist pages
1204 ids = []
c5e8d7af 1205
755eb032 1206 for page_num in itertools.count(1):
dbb94fb0 1207 matches = re.finditer(self._VIDEO_RE, content_html)
6e47b51e
JMF
1208 # We remove the duplicates and the link with index 0
1209 # (it's not the first video of the playlist)
1210 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
dcbb4580 1211 ids.extend(new_ids)
c5e8d7af 1212
dbb94fb0
S
1213 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1214 if not mobj:
c5e8d7af
PH
1215 break
1216
dbb94fb0 1217 more = self._download_json(
5912c639
PH
1218 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1219 'Downloading page #%s' % page_num,
1220 transform_source=uppercase_escape)
dbb94fb0 1221 content_html = more['content_html']
4b7df0d3
JMF
1222 if not content_html.strip():
1223 # Some webpages show a "Load more" button but they don't
1224 # have more videos
1225 break
dbb94fb0
S
1226 more_widget_html = more['load_more_widget_html']
1227
1228 playlist_title = self._html_search_regex(
68eb8e90 1229 r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
78caa52a 1230 page, 'title')
c5e8d7af 1231
652cdaa2 1232 url_results = self._ids_to_results(ids)
dcbb4580 1233 return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1234
1235
0a688bc0 1236class YoutubeTopListIE(YoutubePlaylistIE):
78caa52a 1237 IE_NAME = 'youtube:toplist'
69ea8ca4 1238 IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
9e1a5b84 1239 ' (Example: "yttoplist:music:Top Tracks")')
0a688bc0 1240 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
cdc628a4
PH
1241 _TESTS = [{
1242 'url': 'yttoplist:music:Trending',
1243 'playlist_mincount': 5,
1244 'skip': 'Only works for logged-in users',
1245 }]
0a688bc0
JMF
1246
1247 def _real_extract(self, url):
1248 mobj = re.match(self._VALID_URL, url)
1249 channel = mobj.group('chann')
1250 title = mobj.group('title')
1251 query = compat_urllib_parse.urlencode({'title': title})
cdc628a4
PH
1252 channel_page = self._download_webpage(
1253 'https://www.youtube.com/%s' % channel, title)
1254 link = self._html_search_regex(
1255 r'''(?x)
1256 <a\s+href="([^"]+)".*?>\s*
1257 <span\s+class="branded-page-module-title-text">\s*
1258 <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
1259 channel_page, 'list')
0a688bc0 1260 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
5f6a1245 1261
0a688bc0
JMF
1262 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1263 ids = []
1264 # sometimes the webpage doesn't contain the videos
1265 # retry until we get them
1266 for i in itertools.count(0):
78caa52a 1267 msg = 'Downloading Youtube mix'
0a688bc0
JMF
1268 if i > 0:
1269 msg += ', retry #%d' % i
c9cc0bf5 1270
0a688bc0
JMF
1271 webpage = self._download_webpage(url, title, msg)
1272 ids = orderedSet(re.findall(video_re, webpage))
1273 if ids:
1274 break
1275 url_results = self._ids_to_results(ids)
1276 return self.playlist_result(url_results, playlist_title=title)
1277
1278
c5e8d7af 1279class YoutubeChannelIE(InfoExtractor):
78caa52a 1280 IE_DESC = 'YouTube.com channels'
9ff67727 1281 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
78caa52a 1282 IE_NAME = 'youtube:channel'
cdc628a4
PH
1283 _TESTS = [{
1284 'note': 'paginated channel',
1285 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
1286 'playlist_mincount': 91,
1287 }]
c5e8d7af
PH
1288
1289 def extract_videos_from_page(self, page):
1290 ids_in_page = []
1291 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1292 if mobj.group(1) not in ids_in_page:
1293 ids_in_page.append(mobj.group(1))
1294 return ids_in_page
1295
1296 def _real_extract(self, url):
9ff67727 1297 channel_id = self._match_id(url)
c5e8d7af 1298
c5e8d7af 1299 video_ids = []
b9643eed
JMF
1300 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1301 channel_page = self._download_webpage(url, channel_id)
31812a9e
PH
1302 autogenerated = re.search(r'''(?x)
1303 class="[^"]*?(?:
1304 channel-header-autogenerated-label|
1305 yt-channel-title-autogenerated
1306 )[^"]*"''', channel_page) is not None
c5e8d7af 1307
b9643eed
JMF
1308 if autogenerated:
1309 # The videos are contained in a single page
1310 # the ajax pages can't be used, they are empty
1311 video_ids = self.extract_videos_from_page(channel_page)
b82f815f
PH
1312 entries = [
1313 self.url_result(video_id, 'Youtube', video_id=video_id)
1314 for video_id in video_ids]
1315 return self.playlist_result(entries, channel_id)
1316
1317 def _entries():
23d3608c 1318 more_widget_html = content_html = channel_page
b9643eed 1319 for pagenum in itertools.count(1):
81c2f20b 1320
23d3608c 1321 ids_in_page = self.extract_videos_from_page(content_html)
b82f815f
PH
1322 for video_id in ids_in_page:
1323 yield self.url_result(
1324 video_id, 'Youtube', video_id=video_id)
5f6a1245 1325
23d3608c
JMF
1326 mobj = re.search(
1327 r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
1328 more_widget_html)
1329 if not mobj:
b9643eed 1330 break
c5e8d7af 1331
23d3608c
JMF
1332 more = self._download_json(
1333 'https://youtube.com/%s' % mobj.group('more'), channel_id,
1334 'Downloading page #%s' % (pagenum + 1),
1335 transform_source=uppercase_escape)
1336 content_html = more['content_html']
1337 more_widget_html = more['load_more_widget_html']
1338
b82f815f 1339 return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1340
1341
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, paged through the GData JSON feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only when no other '...IE' name in this module accepts it."""
        # This extractor's regex is very permissive and would also match
        # URLs that belong to the other youtube extractors, so they are
        # asked first.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            page_size = self._GDATA_PAGE_SIZE
            start_index = pagenum * page_size + 1
            note = 'Downloading video ids from %d to %d' % (
                start_index, start_index + page_size)
            page = self._download_webpage(
                self._GDATA_URL % (username, page_size, start_index),
                username, note)

            try:
                feed = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            feed = feed['feed']
            if 'entry' not in feed:
                # No entries on this page: nothing to yield
                return

            # Extract video identifiers
            for entry in feed['entry']:
                title = entry['title']['$t']
                # The id is the last path component of the entry's id field
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }

        return self.playlist_result(
            OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE),
            playlist_title=username)
1410
b05654f0
PH
1411
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData JSON-C videos feed."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        # Lowered to the total reported by the API once a page is fetched
        limit = n
        pagenum = 0

        while PAGE_SIZE * pagenum < limit:
            start_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(
                [video['id'] for video in api_response['items']])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested amount
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee 1453
c9ae7b95 1454
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Variant of YoutubeSearchIE: only the class attributes below differ,
    # the search logic itself is inherited unchanged.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Same feed URL as the parent class plus 'orderby=published'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
75dff0ee 1460
c9ae7b95
PH
1461
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for search result pages given by their URL."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of a search page into a playlist dict."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from(part_code):
            # One <h3 class="yt-lockup-title"> block -> one url entry.
            # The title is optional (fatal=False), the URL is required.
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        entries = [
            _entry_from(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1503
1504
class YoutubeShowIE(InfoExtractor):
    """Expose a /show/ page as a playlist of the season playlists it links to."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one YoutubePlaylist entry per season link."""
        show_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, show_id, 'Downloading show webpage')

        # Every season of the show is linked as its own playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            '%s: Found %s seasons' % (show_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1539
1540
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common machinery for extractors backed by
    http://www.youtube.com/feed_ajax

    Concrete subclasses have to provide _FEED_NAME (the feed requested
    from the endpoint) and _PLAYLIST_TITLE (the title of the result).
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the one remaining %s is the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as one playlist."""
        entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)

            # The markup is looked up under either key; the "load more"
            # widget falls back to that same markup when it is not given.
            feed_html = info.get('feed_html') or info.get('content_html')
            more_html = info.get('load_more_widget_html') or feed_html

            # Drop repeated ids within a page while keeping their order
            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                more_html)
            if next_page is None:
                # No "load more" link: this was the last page
                break
            paging = next_page.group('paging')
            page_num += 1

        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1588
5f6a1245 1589
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    # Matches the feed page URL or the ":ytrec" / ":ytrecommended" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
c626a3d9 1595
5f6a1245 1596
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed (a personal feed)."""

    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    # Matches the feed page URL or the ":ytwatchlater" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
c626a3d9 1603
5f6a1245 1604
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed (a personal feed)."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like every other _VALID_URL in this file: without the
    # r prefix, '\.' is an invalid string escape that newer Python versions
    # warn about. The pattern itself is unchanged.
    # Matches the feed page URL or the ":ythistory" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Makes the base class request action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1611
5f6a1245 1612
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to its playlist and delegate to YoutubePlaylist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _LOGIN_REQUIRED = True
    # Matches the page URL or the ":ytfav" / ":ytfavorites" / ":ytfavourites" keyword
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist."""
        # The fixed favourites page is always fetched, whatever form ``url``
        # took.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The first "list=" value found in the page is used as the playlist id
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1623
1624
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed, following its "load more" links."""

    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict holding every video id found in the feed."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Same procedure as for playlists, except that the video id regex
        # has no index part.
        # For the first page, the videos and the "load more" link are both
        # searched for in the same HTML document.
        content_html = first_page
        more_widget_html = first_page

        collected_ids = []
        page_num = 1
        while True:
            collected_ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not more_link:
                # Nothing further to load
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(collected_ids),
        }
1661
1662
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that lack their remaining parameters and report it."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list

    # Either a /watch? URL with nothing, or only a feature/annotation_id
    # parameter, after the "?", or an attribution_link URL with only its
    # "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying a quoting hint."""
        quoting_hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .'
        )
        raise ExtractorError(quoting_hint, expected=True)