]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Extract start_time
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
7c80519c 22 compat_urllib_parse_urlparse,
c5e8d7af 23 compat_urllib_request,
7c61bd36 24 compat_urlparse,
c5e8d7af 25 compat_str,
4bb4a188
PH
26)
27from ..utils import (
c5e8d7af 28 clean_html,
c5e8d7af 29 ExtractorError,
2d30521a 30 float_or_none,
4bb4a188
PH
31 get_element_by_attribute,
32 get_element_by_id,
dd27fd17 33 int_or_none,
4bb4a188 34 orderedSet,
7c80519c 35 parse_duration,
c93d53f5 36 str_to_int,
c5e8d7af
PH
37 unescapeHTML,
38 unified_strdate,
81c2f20b 39 uppercase_escape,
af214c3a 40 ISO3166Utils,
c5e8d7af
PH
41)
42
5f6a1245 43
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one url_result (handled by the 'Youtube' extractor) per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Failure is reported as False (not None), as documented above
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be submitted; give up
                # instead of dereferencing the failed match
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being present in the first response means the
        # credentials were not accepted
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 195
8377574c 196
360e1ca5 197class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 198 IE_DESC = 'YouTube.com'
cb7dfeea 199 _VALID_URL = r"""(?x)^
c5e8d7af 200 (
edb53e2d 201 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 202 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 203 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 204 (?:www\.)?pwnyoutube\.com/|
f7000f3a 205 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
206 tube\.majestyc\.net/|
207 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
208 (?:.*?\#/)? # handle anchor (#/) redirect urls
209 (?: # the various things that can precede the ID:
ac7553d0 210 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 211 |(?: # or the v= param in all its forms
f7000f3a 212 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
213 (?:\?|\#!?) # the params delimiter ? or # or #!
214 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
215 v=
216 )
f4b05232
JMF
217 ))
218 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 219 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 220 )
c5e8d7af 221 )? # all until now is optional -> you can pass the naked ID
8963d9c2 222 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 223 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
224 (?(1).+)? # if we found the ID, everything can follow
225 $"""
c5e8d7af 226 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
227 _formats = {
228 '5': {'ext': 'flv', 'width': 400, 'height': 240},
229 '6': {'ext': 'flv', 'width': 450, 'height': 270},
230 '13': {'ext': '3gp'},
231 '17': {'ext': '3gp', 'width': 176, 'height': 144},
232 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
233 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
234 '34': {'ext': 'flv', 'width': 640, 'height': 360},
235 '35': {'ext': 'flv', 'width': 854, 'height': 480},
236 '36': {'ext': '3gp', 'width': 320, 'height': 240},
237 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
238 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
239 '43': {'ext': 'webm', 'width': 640, 'height': 360},
240 '44': {'ext': 'webm', 'width': 854, 'height': 480},
241 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
242 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
243 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
244 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 245
1d043b93 246
86fe61c8 247 # 3d videos
43b81eb9
PH
248 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
249 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
250 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
251 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
252 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
253 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
254 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 255
96fb5605 256 # Apple HTTP Live Streaming
43b81eb9
PH
257 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
258 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
259 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
260 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
261 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
262 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
263 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
264
265 # DASH mp4 video
43b81eb9
PH
266 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
267 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
268 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 271 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
272 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
274 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
275 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
276 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 277
f6f1fc92 278 # Dash mp4 audio
62cd676c
PH
279 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
280 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
281 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
282
283 # Dash webm
e75cafe9
A
284 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 290 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
291 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 298 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 299 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
300 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
301 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 302 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 303 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 304 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
305
306 # Dash webm audio
55db73ef 307 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 308 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 309
0857baad
PH
310 # Dash webm audio with opus inside
311 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
312 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
313 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
314
ce6b9a2d
PH
315 # RTMP (unnamed)
316 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 317 }
836a086c 318
78caa52a 319 IE_NAME = 'youtube'
2eb88d95
PH
320 _TESTS = [
321 {
7c80519c 322 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s',
4bc3a23e
PH
323 'info_dict': {
324 'id': 'BaW_jenozKc',
325 'ext': 'mp4',
326 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
327 'uploader': 'Philipp Hagemeister',
328 'uploader_id': 'phihag',
329 'upload_date': '20121002',
330 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
331 'categories': ['Science & Technology'],
3e7c1224
PH
332 'like_count': int,
333 'dislike_count': int,
7c80519c 334 'start_time': 1,
2eb88d95 335 }
0e853ca4 336 },
0e853ca4 337 {
4bc3a23e
PH
338 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
339 'note': 'Test generic use_cipher_signature video (#897)',
340 'info_dict': {
341 'id': 'UxxajLWwzqY',
342 'ext': 'mp4',
343 'upload_date': '20120506',
344 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
345 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
346 'uploader': 'Icona Pop',
347 'uploader_id': 'IconaPop',
2eb88d95 348 }
c108eb73
JMF
349 },
350 {
4bc3a23e
PH
351 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
352 'note': 'Test VEVO video with age protection (#956)',
353 'info_dict': {
354 'id': '07FYdnEawAQ',
355 'ext': 'mp4',
356 'upload_date': '20130703',
357 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
358 'description': 'md5:64249768eec3bc4276236606ea996373',
359 'uploader': 'justintimberlakeVEVO',
360 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
361 }
362 },
fccd3771 363 {
4bc3a23e
PH
364 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
365 'note': 'Embed-only video (#1746)',
366 'info_dict': {
367 'id': 'yZIXLfi8CZQ',
368 'ext': 'mp4',
369 'upload_date': '20120608',
370 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
371 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
372 'uploader': 'SET India',
373 'uploader_id': 'setindia'
fccd3771
PH
374 }
375 },
dd27fd17 376 {
4bc3a23e
PH
377 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
378 'note': '256k DASH audio (format 141) via DASH manifest',
379 'info_dict': {
380 'id': 'a9LDPn-MO4I',
381 'ext': 'm4a',
382 'upload_date': '20121002',
383 'uploader_id': '8KVIDEO',
384 'description': '',
385 'uploader': '8KVIDEO',
386 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 387 },
4bc3a23e
PH
388 'params': {
389 'youtube_include_dash_manifest': True,
390 'format': '141',
4919603f 391 },
dd27fd17 392 },
3489b7d2
JMF
393 # DASH manifest with encrypted signature
394 {
78caa52a
PH
395 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
396 'info_dict': {
397 'id': 'IB3lcPjvWLA',
398 'ext': 'm4a',
b766eb27
JMF
399 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
400 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
401 'uploader': 'AfrojackVEVO',
402 'uploader_id': 'AfrojackVEVO',
403 'upload_date': '20131011',
3489b7d2 404 },
4bc3a23e 405 'params': {
78caa52a
PH
406 'youtube_include_dash_manifest': True,
407 'format': '141',
3489b7d2
JMF
408 },
409 },
aaeb86f6
S
410 # JS player signature function name containing $
411 {
412 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
413 'info_dict': {
414 'id': 'nfWlot6h_JM',
415 'ext': 'm4a',
416 'title': 'Taylor Swift - Shake It Off',
417 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
418 'uploader': 'TaylorSwiftVEVO',
419 'uploader_id': 'TaylorSwiftVEVO',
420 'upload_date': '20140818',
421 },
422 'params': {
423 'youtube_include_dash_manifest': True,
424 'format': '141',
425 },
426 },
aa79ac0c
PH
427 # Controversy video
428 {
429 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
430 'info_dict': {
431 'id': 'T4XJQO3qol8',
432 'ext': 'mp4',
433 'upload_date': '20100909',
434 'uploader': 'The Amazing Atheist',
435 'uploader_id': 'TheAmazingAtheist',
436 'title': 'Burning Everyone\'s Koran',
437 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
438 }
c522adb1
JMF
439 },
440 # Normal age-gate video (No vevo, embed allowed)
441 {
442 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
443 'info_dict': {
444 'id': 'HtVdAasjOgU',
445 'ext': 'mp4',
446 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 447 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
448 'uploader': 'The Witcher',
449 'uploader_id': 'WitcherGame',
450 'upload_date': '20140605',
451 },
452 },
fccae2b9
S
453 # Age-gate video with encrypted signature
454 {
455 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
456 'info_dict': {
457 'id': '6kLq3WMV1nU',
458 'ext': 'mp4',
459 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
460 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
461 'uploader': 'LloydVEVO',
462 'uploader_id': 'LloydVEVO',
463 'upload_date': '20110629',
464 },
465 },
774e208f
PH
466 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
467 {
468 'url': '__2ABJjxzNo',
469 'info_dict': {
470 'id': '__2ABJjxzNo',
471 'ext': 'mp4',
472 'upload_date': '20100430',
473 'uploader_id': 'deadmau5',
474 'description': 'md5:12c56784b8032162bb936a5f76d55360',
475 'uploader': 'deadmau5',
476 'title': 'Deadmau5 - Some Chords (HD)',
477 },
478 'expected_warnings': [
479 'DASH manifest missing',
480 ]
e52a40ab
PH
481 },
482 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
483 {
484 'url': 'lqQg6PlCWgI',
485 'info_dict': {
486 'id': 'lqQg6PlCWgI',
487 'ext': 'mp4',
cbe2bd91
PH
488 'upload_date': '20120731',
489 'uploader_id': 'olympic',
490 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
491 'uploader': 'Olympics',
492 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
493 },
494 'params': {
495 'skip_download': 'requires avconv',
e52a40ab 496 }
cbe2bd91 497 },
6271f1ca
PH
498 # Non-square pixels
499 {
500 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
501 'info_dict': {
502 'id': '_b-2C3KPAM0',
503 'ext': 'mp4',
504 'stretched_ratio': 16 / 9.,
505 'upload_date': '20110310',
506 'uploader_id': 'AllenMeow',
507 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
508 'uploader': '孫艾倫',
509 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
510 },
06b491eb
S
511 },
512 # url_encoded_fmt_stream_map is empty string
513 {
514 'url': 'qEJwOuvDf7I',
515 'info_dict': {
516 'id': 'qEJwOuvDf7I',
517 'ext': 'mp4',
518 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
519 'description': '',
520 'upload_date': '20150404',
521 'uploader_id': 'spbelect',
522 'uploader': 'Наблюдатели Петербурга',
523 },
524 'params': {
525 'skip_download': 'requires avconv',
526 }
527 },
da77d856
S
528 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
529 {
530 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
531 'info_dict': {
532 'id': 'FIl7x6_3R5Y',
533 'ext': 'mp4',
534 'title': 'md5:7b81415841e02ecd4313668cde88737a',
535 'description': 'md5:116377fd2963b81ec4ce64b542173306',
536 'upload_date': '20150625',
537 'uploader_id': 'dorappi2000',
538 'uploader': 'dorappi2000',
539 'formats': 'mincount:33',
540 },
2ee8f5d8 541 },
8a1a26ce
YCH
542 # DASH manifest with segment_list
543 {
544 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
545 'md5': '8ce563a1d667b599d21064e982ab9e31',
546 'info_dict': {
547 'id': 'CsmdDsKjzN8',
548 'ext': 'mp4',
17ee98e1 549 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
550 'uploader': 'Airtek',
551 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
552 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
553 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
554 },
555 'params': {
556 'youtube_include_dash_manifest': True,
557 'format': '135', # bestvideo
558 }
2ee8f5d8 559 },
2eb88d95
PH
560 ]
561
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions, keyed by (player_url, signature cache id);
    # populated by _decrypt_signature
    self._player_cache = dict()
e0df6211 565
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage for video_id is being downloaded."""
    notice = '%s: Downloading video info webpage' % video_id
    self.to_screen(notice)
c5e8d7af 569
def report_information_extraction(self, video_id):
    """Print a notice that video information for video_id is being extracted."""
    notice = '%s: Extracting video information' % video_id
    self.to_screen(notice)
c5e8d7af
PH
573
def report_unavailable_format(self, video_id, format):
    """Print a notice that the requested format is not available for video_id."""
    notice = '%s: Format %s not available' % (video_id, format)
    self.to_screen(notice)
c5e8d7af
PH
577
def report_rtmp_download(self):
    """Print a notice that the download will go through the RTMP protocol."""
    notice = 'RTMP download detected'
    self.to_screen(notice)
c5e8d7af 581
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated parts of example_sig, joined by dots."""
    part_lengths = [compat_str(len(piece)) for piece in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
585
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures laid out like example_sig.

    The player id and type (the URL's file extension, 'js' or 'swf') are
    parsed from player_url.  If the downloader's 'youtube-sigfuncs' cache
    holds an entry for this player and signature layout, a function built
    from the cached index list is returned.  Otherwise the player is
    downloaded, its decipher routine is extracted, and the index list it
    produces for a probe string is stored in the cache.

    Raises ExtractorError if player_url cannot be parsed, if the derived
    cache id is not a plain file name, or if the player type is neither
    'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # The id is used as a cache key; refuse anything containing a directory
    # component.  Raised explicitly so the check survives `python -O`.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec lists, for each output position, the input index to take
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        # The SWF player is read as raw bytes from the response handle
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Explicit raise instead of `assert False`: under `python -O` the
        # assert would vanish and `res` would be unbound below
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the decipher routine on a probe string whose i-th character has
    # code point i, so the output reveals which input index lands where
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
631
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to func for signatures laid out like example_sig.

    func is applied to a probe string whose i-th character has code point
    i; the resulting index sequence is rendered as a sum of index and
    slice expressions on a variable named s and written with to_screen.
    """
    def slice_expr(first, last, stride):
        # Render the run first..last (inclusive, moving by stride) as a slice of s
        head = str(first) if first != 0 else ''
        stop = last + stride
        tail = ':' if stop < 0 else (':%d' % stop)
        stride_part = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (head, tail, stride_part)

    def expressions(indices):
        # stride is None while no run of consecutive indices is open
        stride = None
        # Placeholder only; run_start is assigned whenever stride is
        run_start = '(Never used)'
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is not None:
                if delta != stride:
                    # The open run ended at the previous index
                    yield slice_expr(run_start, before, stride)
                    stride = None
            elif delta in [-1, 1]:
                stride = delta
                run_start = before
            else:
                yield 's[%d]' % before
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield slice_expr(run_start, cur, stride)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_list = [ord(ch) for ch in func(probe)]
    body_expr = ' + '.join(expressions(index_list))
    part_lengths = ', '.join(
        compat_str(len(piece)) for piece in example_sig.split('.'))
    layout_tuple = '(%s)' % part_lengths
    snippet = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
               ' return %s\n') % (layout_tuple, body_expr)
    self.to_screen('Extracted signature function:\n' + snippet)
edf3e38e 670
def _parse_sig_js(self, jscode):
    """Return a one-argument callable wrapping the signature function found in jscode.

    The function name is taken from the text following `.sig||` in the
    player source and the function itself is extracted with JSInterpreter.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    decipher = JSInterpreter(jscode).extract_function(sig_func_name)

    def run(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return run
679
def _parse_sig_swf(self, file_contents):
    """Return a one-argument callable wrapping SignatureDecipher.decipher from the SWF data."""
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run(s):
        # The extracted function takes its arguments as a list
        return decipher([s])
    return run
686
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The signature function is looked up in (or added to) the per-instance
    player cache under the key (player_url, signature cache id of s).
    age_gate is accepted but not used here.

    Raises ExtractorError if player_url is None, or wrapping any exception
    raised while obtaining or running the signature function.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative URLs are fetched over https
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            sig_func = self._player_cache[cache_key]
        else:
            sig_func = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = sig_func
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as e:
        # The full traceback is embedded in the message
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)
e0df6211 710
def _get_subtitles(self, video_id, webpage):
    """Return the available subtitle tracks of video_id.

    The result maps each language code to a list of
    {'url': ..., 'ext': ...} dicts, one per format ('sbv', 'vtt', 'srt').
    Only the first track seen for a language is kept.  An empty dict is
    returned, after a warning, if the track list cannot be downloaded or
    contains no tracks.  webpage is not used here.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_list = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in track_list.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        track_name = track.attrib['name'].encode('utf-8')
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track_name,
            }),
            'ext': ext,
        } for ext in ['sbv', 'vtt', 'srt']]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
742
def _get_automatic_captions(self, video_id, webpage):
    """Build the automatic caption URLs for a video.

    The caption base URL (``ttsurl``) and ``timestamp`` are read from the
    ``ytplayer.config`` JSON found in the already downloaded ``webpage``,
    which is why the page is passed in instead of being fetched again.

    Returns a dict mapping each target language code to a list of
    ``{'url': ..., 'ext': ...}`` dicts (sbv, vtt, srt).  An empty dict is
    returned, after reporting a warning, when the config, the required
    keys or the source track are missing, or the list download fails.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']
        # Ask for the list of available tracks and translation targets
        list_url = caption_url + '&' + compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(list_url, video_id)
        source_track = caption_list.find('track')
        if source_track is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = source_track.attrib['lang_code']
        caption_kind = source_track.attrib.get('kind', '')

        captions = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            captions[target_lang] = [{
                'url': caption_url + '&' + compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': target_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                }),
                'ext': ext,
            } for ext in ['sbv', 'vtt', 'srt']]
        return captions
    # An extractor error can be raised by the download process if there
    # are no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
795
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id contained in ``url``.

    The URL is matched against ``cls._VALID_URL`` in verbose mode and the
    second capture group is returned.  Raises ExtractorError if the URL
    does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
803
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Map itags to the format URLs listed in an m3u8 manifest.

    The manifest is downloaded and split into lines; every non-empty line
    that does not start with ``#`` is treated as a format URL, and its
    itag is taken from the ``itag/<digits>/`` path component.

    Returns a dict mapping itag (string) to URL.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    itag_to_url = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#'-prefixed playlist lines.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        itag_to_url[itag] = line
    return itag_to_url
818
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download the annotations document of a video and return it as text."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 822
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
    """Download a DASH manifest and turn its representations into formats.

    Any encrypted ``/s/<sig>`` component of ``dash_manifest_url`` is
    replaced by ``/signature/<decrypted sig>`` before the download.

    ``fatal`` is forwarded to ``_download_xml``; when the download result
    is ``False`` an empty list is returned.

    Returns a list of format dicts.  Entries of ``self._formats`` for the
    same format id are used as a base and overridden by the values found
    in the manifest.  Representations without a BaseURL are skipped, and
    text/vtt adaptation sets are ignored.  Adaptation sets with a missing
    or unrecognised MIME type only trigger a warning.
    """
    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest',
        fatal=fatal)

    # NOTE(review): presumably _download_xml returns False on a non-fatal
    # download failure - confirm against the base class.
    if dash_doc is False:
        return []

    formats = []
    for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
        mime_type = a.attrib.get('mimeType')
        for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                pass
            # mime_type is None when the AdaptationSet carries no mimeType
            # attribute; guard so that case ends up in the warning branch
            # instead of raising AttributeError.
            elif mime_type and mime_type.startswith(('audio/', 'video/')):
                segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'height': int_or_none(r.attrib.get('height')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                if segment_list is not None:
                    f.update({
                        'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
                        'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
                        'protocol': 'http_dash_segments',
                    })
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    # First time this format id is seen: start from the
                    # static description in self._formats, if any.
                    full_info = self._formats.get(format_id, {}).copy()
                    full_info.update(f)
                    codecs = r.attrib.get('codecs')
                    if codecs:
                        # Fill in only the codec the static table left open.
                        if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                            full_info['vcodec'] = codecs
                        elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                            full_info['acodec'] = codecs
                    formats.append(full_info)
                else:
                    existing_format.update(f)
            else:
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
889
def _real_extract(self, url):
    """Extract the metadata and formats of a single video.

    Steps, in order: resolve the start time and any redirect in ``url``,
    download the watch page, obtain the video info (from the embed page
    for age-gated videos, otherwise from the page's ytplayer.config and
    the get_video_info endpoint), scrape the metadata from the page, build
    the format list (RTMP, encoded stream maps or HLS manifest) and
    finally merge in the formats of every DASH manifest that was found.

    Returns the info dict of the video.

    Raises ExtractorError when the video info has no ``token`` (with the
    reason given by the site if there is one), for "rental" videos, when
    the uploader name is missing, for rtmpe streams, or when no stream
    information is found at all.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # The start time may be given as "t" in the fragment or in the query
    start_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if 't' in query:
            start_time = parse_duration(query['t'][0])
            break

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember every distinct DASH manifest URL, in order of discovery
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed is not None:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise separators and whitespace of the scraped date.
            # This must use the string that was just extracted: ``mobj``
            # still refers to the uploader link match at this point (or
            # is None), so it must not be consulted here.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened redirect links by the full URL kept in "title"
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn an itag -> URL mapping into format dicts, enriched with the
        # static description in self._formats when the itag is known
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
    }
c5e8d7af 1271
5f6a1245 1272
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    # Extractor for playlists, given either as a playlist-style URL or as
    # a bare prefixed playlist id (second alternative of _VALID_URL).
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Same key spelling as the other entries; a misspelt key would
        # leave the minimum-count check unapplied.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Extract a mix, i.e. a playlist generated from a single video.

        The watch page of the seed video (the last 11 characters of
        ``playlist_id``) is downloaded with the list attached, and the
        video ids linked for that list are collected in page order without
        duplicates.  Returns a playlist result.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The title element is looked up under several class names in turn
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist, following its "load more" pages.

        Raises ExtractorError (expected) when the page carries an alert
        saying the playlist does not exist, is private, or that the
        parameters are invalid; other alerts are reported as warnings,
        except the language chooser which is ignored.

        Returns a playlist result whose entries are produced lazily.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch a playlist URL to the matching extraction routine.

        If the URL also names a video (``v`` query parameter) and the
        ``noplaylist`` option is set, only that video is returned.  Ids
        starting with ``RD`` or ``UL`` are handled as mixes, everything
        else as a regular playlist.  Raises ExtractorError if the URL does
        not match ``_VALID_URL``.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
1465
c5e8d7af
PH
1466
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page.

    Preferably delegates to the channel's "UU..." uploads playlist; when the
    channel id cannot be determined it falls back to walking the paginated
    channel listing.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs for the watch links found in *page*.

        Ids are reported once, in order of first appearance. A title found
        on a later occurrence of the same id fills in a missing title.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                # First time this id is seen
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Return a playlist (or a redirect to the uploads playlist) for a channel URL."""
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)

        # The download above is non-fatal, so it may presumably hand back a
        # falsy non-string value on failure; only search the page when there
        # is actual content, otherwise go straight to the fallback below.
        channel_playlist_id = None
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the "UC" channel prefix for the "UU" playlist prefix
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            """Lazily yield url results, following the "load more" links."""
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    # No further "load more" link: last page reached
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1559
1560
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for the videos of a YouTube user (URL or ``ytuser:`` form)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Tell whether this extractor, and no other ``*IE`` in this module, handles *url*."""
        # The regex of this extractor is too permissive and would also match
        # URLs that another youtube extractor can handle, so those other
        # extractors get precedence.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1587
b05654f0 1588
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for ``ytsearch`` queries, scraping the paginated results pages."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another until *n* videos have
        been collected or a page yields no video at all. At most *n* entries
        are returned, as a playlist titled with *query*.

        Raises ExtractorError when a results page carries the
        "search-message" marker.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are collected: requesting one
            # more page would be wasted work and could even abort with
            # "No video results" although the request is already satisfied.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1632
c9ae7b95 1633
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant passing an extra ``search_sort`` query argument."""
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1639
c9ae7b95
PH
1640
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for regular ``/results?search_query=...`` search page URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one url entry per result listed on the search page."""
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Only look at the markup of the result list
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated, a missing link is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1682
1683
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages, returning one playlist entry per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries point at the season playlists of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))
        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1718
1719
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.

    The feed page at https://www.youtube.com/feed/<_FEED_NAME> is downloaded
    and its "load more" links are followed to collect the video ids.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Return a playlist with the videos of the feed, titled _PLAYLIST_TITLE."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: a lazy filter object is always truthy
            # on Python 3, so the emptiness check below would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                # No further "load more" link: end of the feed
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1767
1768
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch later list, extracted as the playlist with id "WL"."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        # The URL itself carries no further information
        return self._extract_playlist('WL')
f459d170 1778
5f6a1245 1779
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor resolving the favourites page to the playlist it references."""
    _LOGIN_REQUIRED = True
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result for the playlist id found on the favourites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
15870e90
PH
1790
1791
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the "recommended" feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 1797
1ed5b5c9 1798
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the "subscriptions" feed."""
    # The shortcut accepted by _VALID_URL starts with a colon, so the
    # description spells it as ":ytsubs"
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 1804
1ed5b5c9 1805
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the "history" feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string so that the regex escape "\." is not parsed as an (invalid)
    # string escape sequence; the resulting pattern is unchanged
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1811
1812
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Pseudo extractor for watch/attribution URLs lacking a video id.

    Matching URLs are never extracted; an explanatory error is raised instead.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an (expected) ExtractorError hinting at shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1856
1857
class YoutubeTruncatedIDIE(InfoExtractor):
    """Pseudo extractor for watch URLs whose ``v`` value is 1 to 10 characters long.

    Matching URLs are never extracted; an explanatory error is raised instead.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_id'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an (expected) ExtractorError naming the incomplete id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)