]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[vessel] Fix pep8 issue
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af
PH
19 compat_urllib_parse,
20 compat_urllib_request,
7c61bd36 21 compat_urlparse,
c5e8d7af 22 compat_str,
4bb4a188
PH
23)
24from ..utils import (
c5e8d7af 25 clean_html,
c5e8d7af 26 ExtractorError,
2d30521a 27 float_or_none,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com (hl=en) with a two-month expiry."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as the docstring promises,
            # instead of falling out with an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; bail out
                # rather than dereferencing the failed match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; a no-op without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 185
8377574c 186
360e1ca5 187class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 259 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
62cd676c
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 290 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 291 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 292 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
293
294 # Dash webm audio
55db73ef 295 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 296 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 297
0857baad
PH
298 # Dash webm audio with opus inside
299 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
300 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
301 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
302
ce6b9a2d
PH
303 # RTMP (unnamed)
304 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 305 }
836a086c 306
78caa52a 307 IE_NAME = 'youtube'
2eb88d95
PH
308 _TESTS = [
309 {
4bc3a23e
PH
310 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
311 'info_dict': {
312 'id': 'BaW_jenozKc',
313 'ext': 'mp4',
314 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
315 'uploader': 'Philipp Hagemeister',
316 'uploader_id': 'phihag',
317 'upload_date': '20121002',
318 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
319 'categories': ['Science & Technology'],
3e7c1224
PH
320 'like_count': int,
321 'dislike_count': int,
2eb88d95 322 }
0e853ca4 323 },
0e853ca4 324 {
4bc3a23e
PH
325 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
326 'note': 'Test generic use_cipher_signature video (#897)',
327 'info_dict': {
328 'id': 'UxxajLWwzqY',
329 'ext': 'mp4',
330 'upload_date': '20120506',
331 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
332 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
333 'uploader': 'Icona Pop',
334 'uploader_id': 'IconaPop',
2eb88d95 335 }
c108eb73
JMF
336 },
337 {
4bc3a23e
PH
338 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
339 'note': 'Test VEVO video with age protection (#956)',
340 'info_dict': {
341 'id': '07FYdnEawAQ',
342 'ext': 'mp4',
343 'upload_date': '20130703',
344 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
345 'description': 'md5:64249768eec3bc4276236606ea996373',
346 'uploader': 'justintimberlakeVEVO',
347 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
348 }
349 },
fccd3771 350 {
4bc3a23e
PH
351 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
352 'note': 'Embed-only video (#1746)',
353 'info_dict': {
354 'id': 'yZIXLfi8CZQ',
355 'ext': 'mp4',
356 'upload_date': '20120608',
357 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
358 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
359 'uploader': 'SET India',
360 'uploader_id': 'setindia'
fccd3771
PH
361 }
362 },
dd27fd17 363 {
4bc3a23e
PH
364 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
365 'note': '256k DASH audio (format 141) via DASH manifest',
366 'info_dict': {
367 'id': 'a9LDPn-MO4I',
368 'ext': 'm4a',
369 'upload_date': '20121002',
370 'uploader_id': '8KVIDEO',
371 'description': '',
372 'uploader': '8KVIDEO',
373 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 374 },
4bc3a23e
PH
375 'params': {
376 'youtube_include_dash_manifest': True,
377 'format': '141',
4919603f 378 },
dd27fd17 379 },
3489b7d2
JMF
380 # DASH manifest with encrypted signature
381 {
78caa52a
PH
382 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
383 'info_dict': {
384 'id': 'IB3lcPjvWLA',
385 'ext': 'm4a',
b766eb27
JMF
386 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
387 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
388 'uploader': 'AfrojackVEVO',
389 'uploader_id': 'AfrojackVEVO',
390 'upload_date': '20131011',
3489b7d2 391 },
4bc3a23e 392 'params': {
78caa52a
PH
393 'youtube_include_dash_manifest': True,
394 'format': '141',
3489b7d2
JMF
395 },
396 },
aaeb86f6
S
397 # JS player signature function name containing $
398 {
399 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
400 'info_dict': {
401 'id': 'nfWlot6h_JM',
402 'ext': 'm4a',
403 'title': 'Taylor Swift - Shake It Off',
404 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
405 'uploader': 'TaylorSwiftVEVO',
406 'uploader_id': 'TaylorSwiftVEVO',
407 'upload_date': '20140818',
408 },
409 'params': {
410 'youtube_include_dash_manifest': True,
411 'format': '141',
412 },
413 },
aa79ac0c
PH
414 # Controversy video
415 {
416 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
417 'info_dict': {
418 'id': 'T4XJQO3qol8',
419 'ext': 'mp4',
420 'upload_date': '20100909',
421 'uploader': 'The Amazing Atheist',
422 'uploader_id': 'TheAmazingAtheist',
423 'title': 'Burning Everyone\'s Koran',
424 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
425 }
c522adb1
JMF
426 },
427 # Normal age-gate video (No vevo, embed allowed)
428 {
429 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
430 'info_dict': {
431 'id': 'HtVdAasjOgU',
432 'ext': 'mp4',
433 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 434 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
435 'uploader': 'The Witcher',
436 'uploader_id': 'WitcherGame',
437 'upload_date': '20140605',
438 },
439 },
fccae2b9
S
440 # Age-gate video with encrypted signature
441 {
442 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
443 'info_dict': {
444 'id': '6kLq3WMV1nU',
445 'ext': 'mp4',
446 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
447 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
448 'uploader': 'LloydVEVO',
449 'uploader_id': 'LloydVEVO',
450 'upload_date': '20110629',
451 },
452 },
774e208f
PH
453 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
454 {
455 'url': '__2ABJjxzNo',
456 'info_dict': {
457 'id': '__2ABJjxzNo',
458 'ext': 'mp4',
459 'upload_date': '20100430',
460 'uploader_id': 'deadmau5',
461 'description': 'md5:12c56784b8032162bb936a5f76d55360',
462 'uploader': 'deadmau5',
463 'title': 'Deadmau5 - Some Chords (HD)',
464 },
465 'expected_warnings': [
466 'DASH manifest missing',
467 ]
e52a40ab
PH
468 },
469 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
470 {
471 'url': 'lqQg6PlCWgI',
472 'info_dict': {
473 'id': 'lqQg6PlCWgI',
474 'ext': 'mp4',
cbe2bd91
PH
475 'upload_date': '20120731',
476 'uploader_id': 'olympic',
477 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
478 'uploader': 'Olympics',
479 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
480 },
481 'params': {
482 'skip_download': 'requires avconv',
e52a40ab 483 }
cbe2bd91 484 },
6271f1ca
PH
485 # Non-square pixels
486 {
487 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
488 'info_dict': {
489 'id': '_b-2C3KPAM0',
490 'ext': 'mp4',
491 'stretched_ratio': 16 / 9.,
492 'upload_date': '20110310',
493 'uploader_id': 'AllenMeow',
494 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
495 'uploader': '孫艾倫',
496 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
497 },
498 }
2eb88d95
PH
499 ]
500
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature cache id) to the decipher callable;
    # populated by _decrypt_signature.
    self._player_cache = {}
e0df6211 504
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 508
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a notice that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
512
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
516
def report_rtmp_download(self):
    """Print a notice that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 520
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of example_sig."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
524
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like example_sig.

    The player id and type ('js' or 'swf') are taken from player_url.  A
    cached index list from the downloader cache is used when present;
    otherwise the player is downloaded and parsed, and the resulting
    permutation is stored in the cache before the function is returned.

    Raises ExtractorError if the player URL cannot be identified, the cache
    key is not a plain file name, or the player type is unsupported.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Explicit check (not an assert, which -O strips): the key is used as a
    # cache entry name and must not contain any directory component.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache key %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec is a list of source indices, one per output char
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Raise instead of `assert False`, so an unknown extension cannot
        # fall through to an unbound `res` when assertions are disabled.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose characters are their own indices
    # to record which input position ends up at each output position.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
570
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to func for signatures shaped like example_sig.

    func is applied to a probe string whose characters are their own
    indices; the resulting index sequence is rendered as a sum of index and
    slice expressions over `s` and written with to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive) as a Python slice of `s`
        first_txt = str(first) if first != 0 else ''
        stop = last + stride
        stop_txt = ':' if stop < 0 else ':%d' % stop
        stride_txt = ':%d' % stride if stride != 1 else ''
        return 's[%s%s%s]' % (first_txt, stop_txt, stride_txt)

    def _expressions(indices):
        stride = None
        # Placeholder only; run_start is always assigned together with stride
        run_start = '(Never used)'
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is not None:
                if delta != stride:
                    # The current run ended at `before`
                    yield _slice_expr(run_start, before, stride)
                    stride = None
            elif delta in [-1, 1]:
                stride = delta
                run_start = before
            else:
                yield 's[%d]' % before
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    permutation = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(_expressions(permutation))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    # NOTE(review): the indentation inside the second literal may have been
    # collapsed by the page extraction this text came from - confirm against
    # the repository copy.
    template = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n')
    code = template % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 609
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable that runs the signature function found in jscode.

    The function name is located with a regex on the `.sig||NAME(` pattern
    and the function itself is extracted with JSInterpreter.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    js_function = JSInterpreter(jscode).extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return js_function([s])

    return decipher
618
def _parse_sig_swf(self, file_contents):
    """Return a callable that runs SignatureDecipher.decipher from the SWF player."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return swf_function([s])

    return decipher
625
83799698 626 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 627 """Turn the encrypted s field into a working signature"""
6b37f0be 628
c8bf86d5 629 if player_url is None:
69ea8ca4 630 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 631
69ea8ca4 632 if player_url.startswith('//'):
78caa52a 633 player_url = 'https:' + player_url
c8bf86d5 634 try:
62af3a0e 635 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
636 if player_id not in self._player_cache:
637 func = self._extract_signature_function(
60064c53 638 video_id, player_url, s
c8bf86d5
PH
639 )
640 self._player_cache[player_id] = func
641 func = self._player_cache[player_id]
642 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 643 self._print_sig_code(func, s)
c8bf86d5
PH
644 return func(s)
645 except Exception as e:
646 tb = traceback.format_exc()
647 raise ExtractorError(
78caa52a 648 'Signature extraction failed: ' + tb, cause=e)
e0df6211 649
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is downloaded from the timedtext listing endpoint; for
    each language (first track wins) one entry per extension in
    sbv/vtt/srt is produced, each a dict with 'url' and 'ext'.  An empty
    dict is returned, after a warning, when the listing cannot be
    downloaded or contains no tracks.  webpage is not used here.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Keep only the first track seen for each language
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in ['sbv', 'vtt', 'srt']]
    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
681
def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping target language code to automatic-caption formats.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.  The caption URL and timestamp are
    read from the ytplayer.config JSON embedded in the page.  An empty dict
    is returned, after a warning, when the config, its keys, or the caption
    track cannot be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']
        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)
        source_track = caption_list.find('track')
        if source_track is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        source_lang = source_track.attrib['lang_code']
        caption_kind = source_track.attrib.get('kind', '')

        captions = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            captions[target_lang] = [{
                'url': caption_url + '&' + compat_urllib_parse.urlencode({
                    'lang': source_lang,
                    'tlang': target_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                }),
                'ext': ext,
            } for ext in ['sbv', 'vtt', 'srt']]
        return captions
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
734
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of cls._VALID_URL.

    The pattern is matched in verbose mode; ExtractorError is raised when
    the URL does not match at all.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
742
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its stream URL.

    Every non-empty manifest line that is not a '#' directive is treated
    as a format URL; its itag is read from the 'itag/<digits>/' path
    component.
    """
    url_map = {}
    manifest = self._download_webpage(
        manifest_url, video_id, 'Downloading formats manifest')
    for line in manifest.split('\n'):
        # Blank lines and '#'-prefixed lines carry no format URL
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
757
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for video_id."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 761
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return one format dict per format id.

    Any '/s/<sig>' component of the manifest URL is replaced by
    '/signature/<decrypted sig>' before the download. Each Representation
    that has a BaseURL child yields a format dict; when the same id is
    seen again, the earlier dict is updated in place instead of adding a
    duplicate entry.
    """
    def _swap_in_decrypted_sig(match):
        decrypted = self._decrypt_signature(
            match.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % decrypted

    manifest_url = re.sub(
        r'/s/([\w\.]+)', _swap_in_decrypted_sig, dash_manifest_url)
    mpd = self._download_xml(
        manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    for representation in mpd.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
        base_url = representation.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
        if base_url is None:
            # Representations without a BaseURL are ignored
            continue
        attrs = representation.attrib
        format_id = attrs['id']
        parsed = {
            'format_id': format_id,
            'url': base_url.text,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
            # presumably the second argument scales bits/s down to kbit/s
            # -- confirm against int_or_none
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': int_or_none(
                base_url.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')),
            'fps': int_or_none(attrs.get('frameRate')),
        }
        already_listed = next(
            (fo for fo in formats if fo['format_id'] == format_id), None)
        if already_listed is None:
            # Start from the static per-itag metadata, then overlay what
            # the manifest reports
            merged = self._formats.get(format_id, {}).copy()
            merged.update(parsed)
            formats.append(merged)
        else:
            already_listed.update(parsed)
    return formats
803
def _real_extract(self, url):
    """Extract metadata and the available formats for a single video.

    Steps, in order: resolve the video id (following a next_url
    redirection if present), download the watch page, obtain the video
    info (from the embed/get_video_info endpoint for age-gated videos,
    otherwise from the page's ytplayer.config with get_video_info as a
    fallback), scrape the metadata, build the format list (rtmp,
    url_encoded_fmt_stream_map/adaptive_fmts, or HLS manifest), optionally
    merge in the DASH manifest, and return the info dict.

    Raises ExtractorError when the video info lacks a token, the video is
    a rental, the uploader is missing, rtmpe streams are offered, or no
    stream information is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Strip the backslashes that escape the URL inside the page's JS
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except (ValueError, KeyError):
            # KeyError: the config has no 'args' entry; treat it like any
            # other unusable config instead of aborting the extraction.
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Normalise separators to single spaces before parsing the date
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the full target kept in their title
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Read the 'watch-<count_name>' counter, dropping thousands separators
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn an itag -> URL map into format dicts, enriched with the
        # static per-itag metadata when the itag is known.
        # player_url is read at call time, i.e. after the signature loop
        # below may have rebound it.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # For age-gated videos the search above may have returned
                # None; json.loads(None) would raise TypeError and make
                # the fallback below unreachable
                player_url = (
                    json.loads(jsplayer_url_json)
                    if jsplayer_url_json else None)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                # Hide the formats we found through non-DASH
                dash_keys = set(df['format_id'] for df in dash_formats)
                for f in formats:
                    if f['format_id'] in dash_keys:
                        f['format_id'] = 'nondash-%s' % f['format_id']
                        f['preference'] = f.get('preference', 0) - 10000
                formats.extend(dash_formats)

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        stretch_w = float(stretched_m.group('w'))
        stretch_h = float(stretched_m.group('h'))
        # A zero component (e.g. yt:stretch=17:0) is invalid data and a zero
        # height would raise ZeroDivisionError; only apply usable ratios
        if stretch_w > 0 and stretch_h > 0:
            ratio = stretch_w / stretch_h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
c5e8d7af 1142
5f6a1245 1143
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists and mixes.

    Accepts playlist URLs (including watch URLs carrying a list parameter)
    as well as bare playlist ids, and returns a playlist result whose
    entries are handed over to the 'Youtube' extractor.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Key spelled like the other entries so the minimum-count check
        # is actually applied to this case
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url_result delegating to the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix by scraping the watch page of its seed video."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title containers in order and keep the first hit
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist, following its "load more" pages.

        Raises ExtractorError (expected) when the page reports that the
        playlist does not exist or is private.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch to the mix or regular playlist extraction.

        A URL that also carries a 'v' parameter yields just that video
        when the 'noplaylist' option is set.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith(('RD', 'UL')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1332
c5e8d7af
PH
1333
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order, without duplicates."""
        seen = set()
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            # Set membership keeps the dedupe linear in the number of links
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist result with every video of the channel.

        Autogenerated channels are read from the single videos page;
        other channels are walked lazily through their "load more" pages.
        """
        channel_id = self._match_id(url)

        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
            entries = [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in video_ids]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Generator: yields the entries of the current page, then
            # follows the "load more" link until none is left
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                ids_in_page = self.extract_videos_from_page(content_html)
                for video_id in ids_in_page:
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1398
1399
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploads, paged through the GData uploads feed."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    _GDATA_PAGE_SIZE = 50

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim url only when no other *IE class of this module accepts it."""
        # This extractor's regex is very permissive, so every other
        # extractor defined in this module gets the first say.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        username = self._match_id(url)

        # The API caps the result size per query (_GDATA_PAGE_SIZE), so
        # the uploads are requested page by page; a page whose feed has
        # no 'entry' key yields nothing.

        def download_page(pagenum):
            page_size = self._GDATA_PAGE_SIZE
            first_index = pagenum * page_size + 1

            raw_page = self._download_webpage(
                self._GDATA_URL % (username, page_size, first_index),
                username,
                'Downloading video ids from %d to %d' % (
                    first_index, first_index + page_size))

            try:
                response = json.loads(raw_page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))

            feed = response['feed']
            if 'entry' in feed:
                for entry in feed['entry']:
                    # The video id is the last path component of the entry id
                    video_id = entry['id']['$t'].split('/')[-1]
                    yield {
                        '_type': 'url',
                        'url': video_id,
                        'ie_key': 'Youtube',
                        'id': video_id,
                        'title': entry['title']['$t'],
                    }

        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
        return self.playlist_result(url_results, playlist_title=username)
1468
b05654f0
PH
1469
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist result holding up to n videos matching query.

        Pages of 50 results are requested until n ids were collected or
        the total reported by the API is exhausted. ExtractorError is
        raised when a response contains no items.
        """
        PAGE_SIZE = 50

        collected_ids = []
        limit = n
        pagenum = 0
        while PAGE_SIZE * pagenum < limit:
            start_index = PAGE_SIZE * pagenum + 1
            pagenum += 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                [video['id'] for video in api_response['items']])

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])

        # The last page may have overshot the requested amount
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
75dff0ee 1511
c9ae7b95 1512
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YoutubeSearchIE variant whose API URL adds ``orderby=published``."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class, plus the ordering parameter
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1518
c9ae7b95
PH
1519
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results?search_query=... page into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page at ``url``.

        The decoded ``search_query`` value becomes the playlist title; each
        ``yt-lockup-title`` heading inside the ``item-section`` list
        contributes one ``url`` entry.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Everything of interest lives inside the first item-section list
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for heading_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated (fatal=False), a missing link is not
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            entries.append({
                '_type': 'url',
                # hrefs on the page may be relative to the site root
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1561
1562
class YoutubeShowIE(InfoExtractor):
    """Expand a youtube.com/show/<id> page into its per-season playlists."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists linked from the show page."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The og:title lookup is optional (fatal=False)
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1597
1598
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """feed_ajax URL for this feed with one ``%s`` slot left for the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        # %%s survives this formatting pass as the paging placeholder
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk every page of the feed and return all videos as one playlist.

        Paging starts at 0; the next paging value is read from the
        ``data-uix-load-more-href`` attribute of the load-more widget and
        the walk stops once that attribute is no longer present.
        """
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)
            # The HTML payload is read from 'feed_html', else 'content_html'
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a separate widget, look for the load-more link in the feed HTML
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1646
5f6a1245 1647
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    IE_NAME = 'youtube:recommended'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Values consumed by YoutubeFeedsInfoExtractor
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _FEED_NAME = 'recommended'
c626a3d9 1654
5f6a1245 1655
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Watch-later list, extracted as the playlist with the fixed id 'WL'."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
    # Same override as YoutubeSubscriptionsIE, which shares this base class:
    # presumably YoutubePlaylistIE declares _TESTS whose URLs do not match
    # this _VALID_URL -- TODO confirm against YoutubePlaylistIE.
    _TESTS = []
    # NOTE(review): the three attributes below look like leftovers from a
    # feed-based implementation; _real_extract here does not read them.
    # Confirm nothing in YoutubePlaylistIE uses them before removing.
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _PERSONAL_FEED = True

    def _real_extract(self, url):
        """Ignore ``url`` and extract the playlist 'WL' via the inherited helper."""
        return self._extract_playlist('WL')
1666
5f6a1245 1667
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'history' feed."""
    IE_NAME = 'youtube:history'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string like every other pattern in this file: '\.' in a plain
    # literal is an invalid escape sequence that newer Pythons warn about.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Selects action_load_personal_feed in YoutubeFeedsInfoExtractor._FEED_TEMPLATE
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1675
5f6a1245 1676
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Fetch /my_favorites and hand the first ``list=`` id found there to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The id runs up to the closing quote or the next query parameter
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1687
1688
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the /feed/subscriptions page into a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Follow the load-more links of the subscriptions feed and return every video id found.

        The first chunk is the feed page itself; further chunks are JSON
        documents with 'content_html' and 'load_more_widget_html' keys.
        """
        title = 'Youtube Subscriptions'
        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        # On the first pass both the videos and the load-more link come from the page
        content_html = page
        more_widget_html = page

        page_num = 1
        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if more_link is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1725
1726
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs that carry no video id and explain the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose pattern: the URL must END right after a single non-id
    # parameter (or directly after "watch?").
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError advising the user to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1770
1771
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose ``v`` value is 1 to 10 characters long and report them as truncated."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [
        {
            'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id and the URL."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)