]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[dailymotion] Convert to new subtitles system
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af 23 compat_str,
4bb4a188
PH
24)
25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
2d30521a 28 float_or_none,
4bb4a188
PH
29 get_element_by_attribute,
30 get_element_by_id,
dd27fd17 31 int_or_none,
9c44d242 32 OnDemandPagedList,
4bb4a188 33 orderedSet,
c5e8d7af
PH
34 unescapeHTML,
35 unified_strdate,
81c2f20b 36 uppercase_escape,
c5e8d7af
PH
37)
38
5f6a1245 39
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie (which carries ``hl=en``) on ``.youtube.com``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not an implicit None) so the result matches the
            # documented True/False contract.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be built; fail the
                # login instead of crashing on match.group() below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # The second-factor form being shown again means the code was rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again is treated as rejected credentials.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in (no-op without a downloader)."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 186
8377574c 187
de7f3446 188class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 189 IE_DESC = 'YouTube.com'
cb7dfeea 190 _VALID_URL = r"""(?x)^
c5e8d7af 191 (
edb53e2d 192 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 193 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 194 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 195 (?:www\.)?pwnyoutube\.com/|
f7000f3a 196 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
197 tube\.majestyc\.net/|
198 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
199 (?:.*?\#/)? # handle anchor (#/) redirect urls
200 (?: # the various things that can precede the ID:
ac7553d0 201 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 202 |(?: # or the v= param in all its forms
f7000f3a 203 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
204 (?:\?|\#!?) # the params delimiter ? or # or #!
205 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
206 v=
207 )
f4b05232
JMF
208 ))
209 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 210 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 211 )
c5e8d7af 212 )? # all until now is optional -> you can pass the naked ID
8963d9c2 213 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 214 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
215 (?(1).+)? # if we found the ID, everything can follow
216 $"""
c5e8d7af 217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
218 _formats = {
219 '5': {'ext': 'flv', 'width': 400, 'height': 240},
220 '6': {'ext': 'flv', 'width': 450, 'height': 270},
221 '13': {'ext': '3gp'},
222 '17': {'ext': '3gp', 'width': 176, 'height': 144},
223 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
224 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
225 '34': {'ext': 'flv', 'width': 640, 'height': 360},
226 '35': {'ext': 'flv', 'width': 854, 'height': 480},
227 '36': {'ext': '3gp', 'width': 320, 'height': 240},
228 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
229 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
230 '43': {'ext': 'webm', 'width': 640, 'height': 360},
231 '44': {'ext': 'webm', 'width': 854, 'height': 480},
232 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
233 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
234
1d043b93 235
86fe61c8 236 # 3d videos
43b81eb9
PH
237 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
238 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
239 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
240 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
241 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
242 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
243 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 244
96fb5605 245 # Apple HTTP Live Streaming
43b81eb9
PH
246 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
247 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
248 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
249 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
250 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
251 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
252 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
253
254 # DASH mp4 video
43b81eb9
PH
255 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
259 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 260 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
261 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
262 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
263 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
265 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 266
f6f1fc92 267 # Dash mp4 audio
62cd676c
PH
268 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
269 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
270 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
271
272 # Dash webm
e75cafe9
A
273 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
278 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 279 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
280 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
286 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 287 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 288 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
289 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
290 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 291 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 292 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 293 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
294
295 # Dash webm audio
55db73ef 296 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 297 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 298
0857baad
PH
299 # Dash webm audio with opus inside
300 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
301 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
302 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
303
ce6b9a2d
PH
304 # RTMP (unnamed)
305 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 306 }
836a086c 307
78caa52a 308 IE_NAME = 'youtube'
2eb88d95
PH
309 _TESTS = [
310 {
4bc3a23e
PH
311 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
312 'info_dict': {
313 'id': 'BaW_jenozKc',
314 'ext': 'mp4',
315 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
316 'uploader': 'Philipp Hagemeister',
317 'uploader_id': 'phihag',
318 'upload_date': '20121002',
319 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
320 'categories': ['Science & Technology'],
3e7c1224
PH
321 'like_count': int,
322 'dislike_count': int,
2eb88d95 323 }
0e853ca4 324 },
0e853ca4 325 {
4bc3a23e
PH
326 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
327 'note': 'Test generic use_cipher_signature video (#897)',
328 'info_dict': {
329 'id': 'UxxajLWwzqY',
330 'ext': 'mp4',
331 'upload_date': '20120506',
332 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
333 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
334 'uploader': 'Icona Pop',
335 'uploader_id': 'IconaPop',
2eb88d95 336 }
c108eb73
JMF
337 },
338 {
4bc3a23e
PH
339 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
340 'note': 'Test VEVO video with age protection (#956)',
341 'info_dict': {
342 'id': '07FYdnEawAQ',
343 'ext': 'mp4',
344 'upload_date': '20130703',
345 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
346 'description': 'md5:64249768eec3bc4276236606ea996373',
347 'uploader': 'justintimberlakeVEVO',
348 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
349 }
350 },
fccd3771 351 {
4bc3a23e
PH
352 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
353 'note': 'Embed-only video (#1746)',
354 'info_dict': {
355 'id': 'yZIXLfi8CZQ',
356 'ext': 'mp4',
357 'upload_date': '20120608',
358 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
359 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
360 'uploader': 'SET India',
361 'uploader_id': 'setindia'
fccd3771
PH
362 }
363 },
dd27fd17 364 {
4bc3a23e
PH
365 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
366 'note': '256k DASH audio (format 141) via DASH manifest',
367 'info_dict': {
368 'id': 'a9LDPn-MO4I',
369 'ext': 'm4a',
370 'upload_date': '20121002',
371 'uploader_id': '8KVIDEO',
372 'description': '',
373 'uploader': '8KVIDEO',
374 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 375 },
4bc3a23e
PH
376 'params': {
377 'youtube_include_dash_manifest': True,
378 'format': '141',
4919603f 379 },
dd27fd17 380 },
3489b7d2
JMF
381 # DASH manifest with encrypted signature
382 {
78caa52a
PH
383 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
384 'info_dict': {
385 'id': 'IB3lcPjvWLA',
386 'ext': 'm4a',
b766eb27
JMF
387 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
388 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
389 'uploader': 'AfrojackVEVO',
390 'uploader_id': 'AfrojackVEVO',
391 'upload_date': '20131011',
3489b7d2 392 },
4bc3a23e 393 'params': {
78caa52a
PH
394 'youtube_include_dash_manifest': True,
395 'format': '141',
3489b7d2
JMF
396 },
397 },
aaeb86f6
S
398 # JS player signature function name containing $
399 {
400 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
401 'info_dict': {
402 'id': 'nfWlot6h_JM',
403 'ext': 'm4a',
404 'title': 'Taylor Swift - Shake It Off',
405 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
406 'uploader': 'TaylorSwiftVEVO',
407 'uploader_id': 'TaylorSwiftVEVO',
408 'upload_date': '20140818',
409 },
410 'params': {
411 'youtube_include_dash_manifest': True,
412 'format': '141',
413 },
414 },
aa79ac0c
PH
415 # Controversy video
416 {
417 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
418 'info_dict': {
419 'id': 'T4XJQO3qol8',
420 'ext': 'mp4',
421 'upload_date': '20100909',
422 'uploader': 'The Amazing Atheist',
423 'uploader_id': 'TheAmazingAtheist',
424 'title': 'Burning Everyone\'s Koran',
425 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
426 }
c522adb1
JMF
427 },
428 # Normal age-gate video (No vevo, embed allowed)
429 {
430 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
431 'info_dict': {
432 'id': 'HtVdAasjOgU',
433 'ext': 'mp4',
434 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 435 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
436 'uploader': 'The Witcher',
437 'uploader_id': 'WitcherGame',
438 'upload_date': '20140605',
439 },
440 },
fccae2b9
S
441 # Age-gate video with encrypted signature
442 {
443 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
444 'info_dict': {
445 'id': '6kLq3WMV1nU',
446 'ext': 'mp4',
447 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
448 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
449 'uploader': 'LloydVEVO',
450 'uploader_id': 'LloydVEVO',
451 'upload_date': '20110629',
452 },
453 },
774e208f
PH
454 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
455 {
456 'url': '__2ABJjxzNo',
457 'info_dict': {
458 'id': '__2ABJjxzNo',
459 'ext': 'mp4',
460 'upload_date': '20100430',
461 'uploader_id': 'deadmau5',
462 'description': 'md5:12c56784b8032162bb936a5f76d55360',
463 'uploader': 'deadmau5',
464 'title': 'Deadmau5 - Some Chords (HD)',
465 },
466 'expected_warnings': [
467 'DASH manifest missing',
468 ]
e52a40ab
PH
469 },
470 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
471 {
472 'url': 'lqQg6PlCWgI',
473 'info_dict': {
474 'id': 'lqQg6PlCWgI',
475 'ext': 'mp4',
cbe2bd91
PH
476 'upload_date': '20120731',
477 'uploader_id': 'olympic',
478 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
479 'uploader': 'Olympics',
480 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
481 },
482 'params': {
483 'skip_download': 'requires avconv',
e52a40ab 484 }
cbe2bd91 485 },
6271f1ca
PH
486 # Non-square pixels
487 {
488 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
489 'info_dict': {
490 'id': '_b-2C3KPAM0',
491 'ext': 'mp4',
492 'stretched_ratio': 16 / 9.,
493 'upload_date': '20110310',
494 'uploader_id': 'AllenMeow',
495 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
496 'uploader': '孫艾倫',
497 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
498 },
499 }
2eb88d95
PH
500 ]
501
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature with the signature functions it extracts.
    self._player_cache = dict()
e0df6211 505
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 509
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce on screen that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
513
def report_unavailable_format(self, video_id, format):
    """Announce on screen that the requested format is not available."""
    details = (video_id, format)
    self.to_screen('%s: Format %s not available' % details)
c5e8d7af
PH
517
def report_rtmp_download(self):
    """Announce on screen that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 521
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of a signature, joined by dots."""
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
525
526 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 527 id_m = re.match(
60620368 528 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 529 player_url)
c081b35c
PH
530 if not id_m:
531 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
532 player_type = id_m.group('ext')
533 player_id = id_m.group('id')
534
c4417ddb 535 # Read from filesystem cache
60064c53
PH
536 func_id = '%s_%s_%s' % (
537 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 538 assert os.path.basename(func_id) == func_id
a0e07d31 539
69ea8ca4 540 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 541 if cache_spec is not None:
78caa52a 542 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 543
e0df6211
PH
544 if player_type == 'js':
545 code = self._download_webpage(
546 player_url, video_id,
69ea8ca4
PH
547 note='Downloading %s player %s' % (player_type, player_id),
548 errnote='Download of %s failed' % player_url)
83799698 549 res = self._parse_sig_js(code)
c4417ddb 550 elif player_type == 'swf':
e0df6211
PH
551 urlh = self._request_webpage(
552 player_url, video_id,
69ea8ca4
PH
553 note='Downloading %s player %s' % (player_type, player_id),
554 errnote='Download of %s failed' % player_url)
e0df6211 555 code = urlh.read()
83799698 556 res = self._parse_sig_swf(code)
e0df6211
PH
557 else:
558 assert False, 'Invalid player type %r' % player_type
559
a0e07d31 560 if cache_spec is None:
78caa52a 561 test_string = ''.join(map(compat_chr, range(len(example_sig))))
a0e07d31
PH
562 cache_res = res(test_string)
563 cache_spec = [ord(c) for c in cache_res]
83799698 564
69ea8ca4 565 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
566 return res
567
def _print_sig_code(self, func, example_sig):
    """Show Python-like source that is equivalent to the signature function.

    func is applied to a probe string whose characters have the code points
    0 .. len(example_sig) - 1, so the code points of its output are the
    input positions that func selected. Those positions are rendered as a
    ' + '-joined list of index and slice expressions on ``s`` and passed to
    to_screen together with a check on the signature's part lengths.
    """
    def gen_sig_code(idxs):
        """Yield 's[...]' snippets covering the index list idxs in order."""
        def _genslice(start, end, step):
            # Render the run start..end (end inclusive) walked by step as a slice.
            starts = '' if start == 0 else str(start)
            # The slice stop is one step past the last index; when that would
            # be negative the stop is left open.
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        # Walk consecutive pairs (prev, i) of the index list.
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run: keep going while the stride is unchanged,
                # otherwise emit the finished run as a slice.
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A run of adjacent indices (ascending or descending) starts at prev.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        # NOTE(review): if idxs has fewer than two items the loop never binds
        # i, so the yield below would fail - confirm callers never pass that.
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    # NOTE(review): whitespace runs appear collapsed in this listing; confirm
    # the indentation inside the ' return %s' literal against the canonical file.
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 606
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the signature routine is located with a regex on jscode and
    the routine is extracted with JSInterpreter. The returned callable takes
    the signature string and passes it as the single argument.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
615
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of the SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter. The returned callable takes the signature string
    and passes it as the single argument.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return swf_function([s])

    return decipher
622
83799698 623 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 624 """Turn the encrypted s field into a working signature"""
6b37f0be 625
c8bf86d5 626 if player_url is None:
69ea8ca4 627 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 628
69ea8ca4 629 if player_url.startswith('//'):
78caa52a 630 player_url = 'https:' + player_url
c8bf86d5 631 try:
62af3a0e 632 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
633 if player_id not in self._player_cache:
634 func = self._extract_signature_function(
60064c53 635 video_id, player_url, s
c8bf86d5
PH
636 )
637 self._player_cache[player_id] = func
638 func = self._player_cache[player_id]
639 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 640 self._print_sig_code(func, s)
c8bf86d5
PH
641 return func(s)
642 except Exception as e:
643 tb = traceback.format_exc()
644 raise ExtractorError(
78caa52a 645 'Signature extraction failed: ' + tb, cause=e)
e0df6211 646
1f343eaa 647 def _get_available_subtitles(self, video_id, webpage):
de7f3446 648 try:
60e47a26 649 subs_doc = self._download_xml(
38c2e5b8 650 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
651 video_id, note=False)
652 except ExtractorError as err:
69ea8ca4 653 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446 654 return {}
de7f3446
JMF
655
656 sub_lang_list = {}
60e47a26
JMF
657 for track in subs_doc.findall('track'):
658 lang = track.attrib['lang_code']
7e660ac1
LD
659 if lang in sub_lang_list:
660 continue
de7f3446
JMF
661 params = compat_urllib_parse.urlencode({
662 'lang': lang,
663 'v': video_id,
ca715127 664 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
60e47a26 665 'name': track.attrib['name'].encode('utf-8'),
de7f3446 666 })
78caa52a 667 url = 'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
668 sub_lang_list[lang] = url
669 if not sub_lang_list:
69ea8ca4 670 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
671 return {}
672 return sub_lang_list
673
055e6f36 674 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
675 """We need the webpage for getting the captions url, pass it as an
676 argument to speed up the process."""
ca715127 677 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
69ea8ca4 678 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 679 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 680 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
681 if mobj is None:
682 self._downloader.report_warning(err_msg)
683 return {}
684 player_config = json.loads(mobj.group(1))
685 try:
0792d563
PH
686 args = player_config['args']
687 caption_url = args['ttsurl']
688 timestamp = args['timestamp']
055e6f36
JMF
689 # We get the available subtitles
690 list_params = compat_urllib_parse.urlencode({
691 'type': 'list',
692 'tlangs': 1,
693 'asrs': 1,
de7f3446 694 })
055e6f36 695 list_url = caption_url + '&' + list_params
e26f8712 696 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 697 original_lang_node = caption_list.find('track')
7d900ef1 698 if original_lang_node is None:
69ea8ca4 699 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
700 return {}
701 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 702 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
703
704 sub_lang_list = {}
705 for lang_node in caption_list.findall('target'):
706 sub_lang = lang_node.attrib['lang_code']
707 params = compat_urllib_parse.urlencode({
708 'lang': original_lang,
709 'tlang': sub_lang,
710 'fmt': sub_format,
711 'ts': timestamp,
7d900ef1 712 'kind': caption_kind,
055e6f36
JMF
713 })
714 sub_lang_list[sub_lang] = caption_url + '&' + params
715 return sub_lang_list
de7f3446
JMF
716 # An extractor error can be raise by the download process if there are
717 # no automatic captions but there are subtitles
718 except (KeyError, ExtractorError):
719 self._downloader.report_warning(err_msg)
720 return {}
721
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id captured by the second group of _VALID_URL.

    Raises ExtractorError when url does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
729
1d043b93
JMF
730 def _extract_from_m3u8(self, manifest_url, video_id):
731 url_map = {}
5f6a1245 732
1d043b93
JMF
733 def _get_urls(_manifest):
734 lines = _manifest.split('\n')
735 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 736 lines)
1d043b93 737 return urls
78caa52a 738 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
739 formats_urls = _get_urls(manifest)
740 for format_url in formats_urls:
890f62e8 741 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
742 url_map[itag] = format_url
743 return url_map
744
1fb07d10
JG
745 def _extract_annotations(self, video_id):
746 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 747 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 748
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return one format dict per format id.

    Any '/s/<sig>' component of the manifest URL is replaced by the
    decrypted '/signature/<sig>' before downloading.  Representations
    without a BaseURL are skipped; when a format id appears several times
    the later values update the earlier entry.
    """
    mpd_ns = '{urn:mpeg:DASH:schema:MPD:2011}'

    def _signed_path(sig_match):
        plain_sig = self._decrypt_signature(
            sig_match.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % plain_sig

    manifest_url = re.sub(r'/s/([\w\.]+)', _signed_path, dash_manifest_url)
    manifest = self._download_xml(
        manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    # format_id -> dict already stored in ``formats``
    known_formats = {}
    for representation in manifest.findall('.//%sRepresentation' % mpd_ns):
        base_url = representation.find(mpd_ns + 'BaseURL')
        if base_url is None:
            continue
        attrs = representation.attrib
        format_id = attrs['id']
        info = {
            'format_id': format_id,
            'url': base_url.text,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': int_or_none(
                base_url.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')),
            'fps': int_or_none(attrs.get('frameRate')),
        }
        previous = known_formats.get(format_id)
        if previous is not None:
            previous.update(info)
            continue
        # Start from the static description of this itag, if there is one
        merged = self._formats.get(format_id, {}).copy()
        merged.update(info)
        formats.append(merged)
        known_formats[format_id] = merged
    return formats
790
def _real_extract(self, url):
    """Extract the metadata and the available formats of a single video.

    Downloads the watch page (and, when needed, the embed page and the
    get_video_info pages), collects the metadata fields and builds the
    format list from RTMP, the encoded stream maps or an HLS manifest,
    optionally extended with the DASH manifest.

    Returns the info dict of the video, or None when the 'listsubtitles'
    parameter is set (the available subtitles are listed instead).
    Raises ExtractorError when the video info carries no token, when the
    uploader is missing, for "rental" and rtmpe videos, and when no
    format source is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes (e.g. "\/") from the matched URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            # Try each 'el' variant in turn and stop at the first answer
            # that contains a token
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            # Report the reason found in the video info as an expected error
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse whitespace before
        # handing the text to unified_strdate
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories: only the first link of the "Category" list is used
    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        # Fall back to the <meta name="description"> tag
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Return the number shown in the "watch-<count_name>" element as an
        # int (thousands separators removed), or None when it is not found
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        # Only list the available subtitles; no info dict is returned
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Build one format dict per (itag, url) pair, completed with the
        # static data of self._formats when the itag is known there
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        # Both maps are comma separated lists of query strings
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                # Plain signature: append it as is
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # Encrypted signature: the player is needed to decrypt it
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    # Verbose mode: report which player is used together
                    # with the signature cache id
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                # A broken DASH manifest only costs the DASH formats
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                # Hide the formats we found through non-DASH
                dash_keys = set(df['format_id'] for df in dash_formats)
                for f in formats:
                    if f['format_id'] in dash_keys:
                        f['format_id'] = 'nondash-%s' % f['format_id']
                        f['preference'] = f.get('preference', 0) - 10000
                formats.extend(dash_formats)

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            # Formats explicitly marked as having no video are left alone
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
c5e8d7af 1131
5f6a1245 1132
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract the videos of a playlist given its URL or its bare id.

    Mixes (ids starting with 'RD') are read from the watch page; all other
    playlists are read from the playlist page and its "load more" pages.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Was misspelled 'playlist_mincout', which left the minimum count
        # of this test unchecked
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Turn a sequence of video ids into url results for the Youtube extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Return the playlist result of a mix, read from its watch page."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Use the first of the known title elements that is present
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for url.

        When the URL also names a video and the 'noplaylist' parameter is
        set, a url result for just that video is returned instead.
        Raises ExtractorError for an invalid URL and for playlists that do
        not exist or are private.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        # For the first page both the videos and the "load more" widget
        # are looked up in the playlist page itself
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1318
1319
class YoutubeChannelIE(InfoExtractor):
    """Extract all the videos listed on the 'videos' page of a channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, without duplicates, in order."""
        unique_ids = []
        for candidate in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if candidate in unique_ids:
                continue
            unique_ids.append(candidate)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        channel_id = self._match_id(url)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_match is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            return self.playlist_result(
                [self.url_result(video_id, 'Youtube', video_id=video_id)
                 for video_id in self.extract_videos_from_page(channel_page)],
                channel_id)

        def _entries():
            # The first round reads both the videos and the "load more"
            # link from the channel page itself
            listing_html = widget_html = channel_page
            page_no = 1
            while True:
                for video_id in self.extract_videos_from_page(listing_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                more_match = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    widget_html)
                if more_match is None:
                    return

                page_no += 1
                more = self._download_json(
                    'https://youtube.com/%s' % more_match.group('more'), channel_id,
                    'Downloading page #%s' % page_no,
                    transform_source=uppercase_escape)
                listing_html = more['content_html']
                widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1384
1385
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a user, paged through the GData uploads feed."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex of this one is too permissive and would match
        # as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist result titled with the user name."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The feed is 1-based: page 0 covers items 1.._GDATA_PAGE_SIZE
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            # The upper bound is inclusive, hence the -1 (previously the
            # note announced one id more than the page can hold)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE - 1))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries on this page: nothing to yield
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1454
b05654f0
PH
1455
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the 'ytsearch' keyword, backed by the GData video feed."""
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Return a playlist result with at most n videos matching query."""
        page_size = 50
        found_ids = []
        # Lowered to the total reported by the API once a page is received
        upper_bound = n
        page_index = 0

        while page_size * page_index < upper_bound:
            first_item = page_size * page_index + 1
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                first_item)
            raw_json = self._download_webpage(
                api_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (page_index + 1),
                errnote='Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            found_ids.extend(video['id'] for video in api_response['items'])

            upper_bound = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have brought more ids than requested
        entries = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in found_ids[:n]]
        return self.playlist_result(entries, query)
75dff0ee 1497
c9ae7b95 1498
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds 'orderby=published'."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same feed as the parent class, with the ordering parameter appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1504
c9ae7b95
PH
1505
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results?search_query=... page into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page at url.

        The decoded search query is used both as the download id and as
        the playlist title.  Each '<h3 class="yt-lockup-title">' block
        inside the '<ol class="item-section">' list yields one 'url'
        entry; a missing title is tolerated, a missing link is not.
        """
        url_match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(url_match.group('query'))

        webpage = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        title_patterns = [r'(?s)title="([^"]+)"', r'>([^<]+)</a>']
        entries = []
        for item_html in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html):
            item_title = self._html_search_regex(
                title_patterns, item_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            entries.append({
                '_type': 'url',
                # hrefs are resolved against the site root
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1547
1548
class YoutubeShowIE(InfoExtractor):
    """Expand a /show/<id> page into one playlist reference per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries point at the season playlists.

        Every 'href="/playlist?list=..."' link found on the show page is
        treated as one season and handed over to the 'YoutubePlaylist'
        extractor.  The title comes from the page's Open Graph data and
        may be missing.
        """
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_links = list(
            re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(
            '%s: Found %s seasons' % (playlist_id, len(season_links)))

        entries = []
        for link in season_links:
            season_url = 'https://www.youtube.com' + link.group(1)
            entries.append(self.url_result(season_url, 'YoutubePlaylist'))

        show_title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': show_title,
            'entries': entries,
        }
04cc9617
JMF
1583
1584
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common logic for extractors reading https://www.youtube.com/feed_ajax

    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # True: request action_load_personal_feed; False: action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for this feed; the one open '%s' takes the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed page by page and return all videos as a playlist.

        Paging stops as soon as a response contains no
        'data-uix-load-more-href' link carrying a 'paging' value.
        """
        collected = []
        next_paging = 0
        for page_no in itertools.count(1):
            page_info = self._download_json(
                self._FEED_TEMPLATE % next_paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_no,
                transform_source=uppercase_escape)
            # The markup arrives under either key
            html = page_info.get('feed_html') or page_info.get('content_html')
            # Without a dedicated widget, look for the link in the feed markup
            widget_html = page_info.get('load_more_widget_html') or html

            unique_ids = orderedSet(
                found.group(1)
                for found in re.finditer(r'"/watch\?v=(.*?)["&]', html))
            for video_id in unique_ids:
                collected.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            more = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                widget_html)
            if more is None:
                break
            next_paging = more.group('paging')

        return self.playlist_result(collected, playlist_title=self._PLAYLIST_TITLE)
1632
5f6a1245 1633
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
c626a3d9 1639
5f6a1245 1640
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
c626a3d9 1647
5f6a1245 1648
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern value itself is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # The history is requested through action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1655
5f6a1245 1656
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the my_favorites page to the playlist it links to."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Delegate to 'YoutubePlaylist' with the first 'list=' id on the page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1667
1668
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos shown on the /feed/subscriptions page."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    IE_NAME = 'youtube:subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist of all videos reachable through "load more".

        Works like the playlist extraction, except that the video-id
        pattern carries no index.  The first round scans the feed page
        itself; later rounds scan the JSON fragments behind each
        'data-uix-load-more-href' link until none is left.
        """
        title = 'Youtube Subscriptions'
        feed_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Initially both the videos and the "load more" link live in the page
        videos_html = feed_page
        widget_html = feed_page
        collected_ids = []
        page_num = 1

        while True:
            collected_ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', videos_html)))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if not more_link:
                break

            fragment = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            videos_html = fragment['content_html']
            widget_html = fragment['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(collected_ids),
        }
1705
1706
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that end before any video id.

    Such URLs are what remains when an unquoted '&' made the shell cut the
    command line; extraction always fails with an explanatory message.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting at shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1750
1751
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose 'v' value is only 1 to 10 characters long."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_id'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        complaint = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(complaint, expected=True)