]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Merge branch 'atomicdryad-pr-bbcnews'
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
7c80519c 22 compat_urllib_parse_urlparse,
c5e8d7af 23 compat_urllib_request,
7c61bd36 24 compat_urlparse,
c5e8d7af 25 compat_str,
4bb4a188
PH
26)
27from ..utils import (
c5e8d7af 28 clean_html,
c5e8d7af 29 ExtractorError,
2d30521a 30 float_or_none,
4bb4a188
PH
31 get_element_by_attribute,
32 get_element_by_id,
dd27fd17 33 int_or_none,
4bb4a188 34 orderedSet,
7c80519c 35 parse_duration,
c93d53f5 36 str_to_int,
c5e8d7af
PH
37 unescapeHTML,
38 unified_strdate,
81c2f20b 39 uppercase_escape,
af214c3a 40 ISO3166Utils,
c5e8d7af
PH
41)
42
5f6a1245 43
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one url_result (for the 'Youtube' IE) per id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) as promised by the docstring
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without secTok the TFA form cannot be built; fail the login
                # instead of crashing on match.group() below
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same as above: timeStmp is mandatory for the TFA form
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # The second-factor form being served again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 195
8377574c 196
360e1ca5 197class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 198 IE_DESC = 'YouTube.com'
cb7dfeea 199 _VALID_URL = r"""(?x)^
c5e8d7af 200 (
edb53e2d 201 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 202 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 203 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 204 (?:www\.)?pwnyoutube\.com/|
f7000f3a 205 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
206 tube\.majestyc\.net/|
207 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
208 (?:.*?\#/)? # handle anchor (#/) redirect urls
209 (?: # the various things that can precede the ID:
ac7553d0 210 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 211 |(?: # or the v= param in all its forms
f7000f3a 212 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
213 (?:\?|\#!?) # the params delimiter ? or # or #!
214 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
215 v=
216 )
f4b05232
JMF
217 ))
218 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 219 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 220 )
c5e8d7af 221 )? # all until now is optional -> you can pass the naked ID
8963d9c2 222 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 223 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
224 (?(1).+)? # if we found the ID, everything can follow
225 $"""
c5e8d7af 226 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
227 _formats = {
228 '5': {'ext': 'flv', 'width': 400, 'height': 240},
229 '6': {'ext': 'flv', 'width': 450, 'height': 270},
230 '13': {'ext': '3gp'},
231 '17': {'ext': '3gp', 'width': 176, 'height': 144},
232 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
233 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
234 '34': {'ext': 'flv', 'width': 640, 'height': 360},
235 '35': {'ext': 'flv', 'width': 854, 'height': 480},
236 '36': {'ext': '3gp', 'width': 320, 'height': 240},
237 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
238 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
239 '43': {'ext': 'webm', 'width': 640, 'height': 360},
240 '44': {'ext': 'webm', 'width': 854, 'height': 480},
241 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
242 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
243 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
244 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 245
1d043b93 246
86fe61c8 247 # 3d videos
43b81eb9
PH
248 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
249 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
250 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
251 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
252 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
253 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
254 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 255
96fb5605 256 # Apple HTTP Live Streaming
43b81eb9
PH
257 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
258 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
259 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
260 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
261 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
262 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
263 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
264
265 # DASH mp4 video
43b81eb9
PH
266 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
267 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
268 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 271 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
272 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
274 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
275 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
276 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 277
f6f1fc92 278 # Dash mp4 audio
62cd676c
PH
279 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
280 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
281 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
282
283 # Dash webm
e75cafe9
A
284 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
285 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
286 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
287 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
288 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
289 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 290 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
291 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
292 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
293 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 298 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 299 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
300 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
301 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 302 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 303 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 304 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
305
306 # Dash webm audio
55db73ef 307 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 308 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 309
0857baad
PH
310 # Dash webm audio with opus inside
311 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
312 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
313 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
314
ce6b9a2d
PH
315 # RTMP (unnamed)
316 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 317 }
836a086c 318
78caa52a 319 IE_NAME = 'youtube'
2eb88d95
PH
320 _TESTS = [
321 {
297a564b 322 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
4bc3a23e
PH
323 'info_dict': {
324 'id': 'BaW_jenozKc',
325 'ext': 'mp4',
326 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
327 'uploader': 'Philipp Hagemeister',
328 'uploader_id': 'phihag',
329 'upload_date': '20121002',
330 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
331 'categories': ['Science & Technology'],
3e7c1224
PH
332 'like_count': int,
333 'dislike_count': int,
7c80519c 334 'start_time': 1,
297a564b 335 'end_time': 9,
2eb88d95 336 }
0e853ca4 337 },
0e853ca4 338 {
4bc3a23e
PH
339 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
340 'note': 'Test generic use_cipher_signature video (#897)',
341 'info_dict': {
342 'id': 'UxxajLWwzqY',
343 'ext': 'mp4',
344 'upload_date': '20120506',
345 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
346 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
347 'uploader': 'Icona Pop',
348 'uploader_id': 'IconaPop',
2eb88d95 349 }
c108eb73
JMF
350 },
351 {
4bc3a23e
PH
352 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
353 'note': 'Test VEVO video with age protection (#956)',
354 'info_dict': {
355 'id': '07FYdnEawAQ',
356 'ext': 'mp4',
357 'upload_date': '20130703',
358 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
359 'description': 'md5:64249768eec3bc4276236606ea996373',
360 'uploader': 'justintimberlakeVEVO',
361 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
362 }
363 },
fccd3771 364 {
4bc3a23e
PH
365 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
366 'note': 'Embed-only video (#1746)',
367 'info_dict': {
368 'id': 'yZIXLfi8CZQ',
369 'ext': 'mp4',
370 'upload_date': '20120608',
371 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
372 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
373 'uploader': 'SET India',
374 'uploader_id': 'setindia'
fccd3771
PH
375 }
376 },
dd27fd17 377 {
4bc3a23e
PH
378 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
379 'note': '256k DASH audio (format 141) via DASH manifest',
380 'info_dict': {
381 'id': 'a9LDPn-MO4I',
382 'ext': 'm4a',
383 'upload_date': '20121002',
384 'uploader_id': '8KVIDEO',
385 'description': '',
386 'uploader': '8KVIDEO',
387 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 388 },
4bc3a23e
PH
389 'params': {
390 'youtube_include_dash_manifest': True,
391 'format': '141',
4919603f 392 },
dd27fd17 393 },
3489b7d2
JMF
394 # DASH manifest with encrypted signature
395 {
78caa52a
PH
396 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
397 'info_dict': {
398 'id': 'IB3lcPjvWLA',
399 'ext': 'm4a',
b766eb27
JMF
400 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
401 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
402 'uploader': 'AfrojackVEVO',
403 'uploader_id': 'AfrojackVEVO',
404 'upload_date': '20131011',
3489b7d2 405 },
4bc3a23e 406 'params': {
78caa52a
PH
407 'youtube_include_dash_manifest': True,
408 'format': '141',
3489b7d2
JMF
409 },
410 },
aaeb86f6
S
411 # JS player signature function name containing $
412 {
413 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
414 'info_dict': {
415 'id': 'nfWlot6h_JM',
416 'ext': 'm4a',
417 'title': 'Taylor Swift - Shake It Off',
418 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
419 'uploader': 'TaylorSwiftVEVO',
420 'uploader_id': 'TaylorSwiftVEVO',
421 'upload_date': '20140818',
422 },
423 'params': {
424 'youtube_include_dash_manifest': True,
425 'format': '141',
426 },
427 },
aa79ac0c
PH
428 # Controversy video
429 {
430 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
431 'info_dict': {
432 'id': 'T4XJQO3qol8',
433 'ext': 'mp4',
434 'upload_date': '20100909',
435 'uploader': 'The Amazing Atheist',
436 'uploader_id': 'TheAmazingAtheist',
437 'title': 'Burning Everyone\'s Koran',
438 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
439 }
c522adb1
JMF
440 },
441 # Normal age-gate video (No vevo, embed allowed)
442 {
443 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
444 'info_dict': {
445 'id': 'HtVdAasjOgU',
446 'ext': 'mp4',
447 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 448 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
449 'uploader': 'The Witcher',
450 'uploader_id': 'WitcherGame',
451 'upload_date': '20140605',
452 },
453 },
fccae2b9
S
454 # Age-gate video with encrypted signature
455 {
456 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
457 'info_dict': {
458 'id': '6kLq3WMV1nU',
459 'ext': 'mp4',
460 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
461 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
462 'uploader': 'LloydVEVO',
463 'uploader_id': 'LloydVEVO',
464 'upload_date': '20110629',
465 },
466 },
774e208f
PH
467 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
468 {
469 'url': '__2ABJjxzNo',
470 'info_dict': {
471 'id': '__2ABJjxzNo',
472 'ext': 'mp4',
473 'upload_date': '20100430',
474 'uploader_id': 'deadmau5',
475 'description': 'md5:12c56784b8032162bb936a5f76d55360',
476 'uploader': 'deadmau5',
477 'title': 'Deadmau5 - Some Chords (HD)',
478 },
479 'expected_warnings': [
480 'DASH manifest missing',
481 ]
e52a40ab
PH
482 },
483 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
484 {
485 'url': 'lqQg6PlCWgI',
486 'info_dict': {
487 'id': 'lqQg6PlCWgI',
488 'ext': 'mp4',
cbe2bd91
PH
489 'upload_date': '20120731',
490 'uploader_id': 'olympic',
491 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
492 'uploader': 'Olympics',
493 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
494 },
495 'params': {
496 'skip_download': 'requires avconv',
e52a40ab 497 }
cbe2bd91 498 },
6271f1ca
PH
499 # Non-square pixels
500 {
501 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
502 'info_dict': {
503 'id': '_b-2C3KPAM0',
504 'ext': 'mp4',
505 'stretched_ratio': 16 / 9.,
506 'upload_date': '20110310',
507 'uploader_id': 'AllenMeow',
508 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
509 'uploader': '孫艾倫',
510 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
511 },
06b491eb
S
512 },
513 # url_encoded_fmt_stream_map is empty string
514 {
515 'url': 'qEJwOuvDf7I',
516 'info_dict': {
517 'id': 'qEJwOuvDf7I',
518 'ext': 'mp4',
519 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
520 'description': '',
521 'upload_date': '20150404',
522 'uploader_id': 'spbelect',
523 'uploader': 'Наблюдатели Петербурга',
524 },
525 'params': {
526 'skip_download': 'requires avconv',
527 }
528 },
da77d856
S
529 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
530 {
531 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
532 'info_dict': {
533 'id': 'FIl7x6_3R5Y',
534 'ext': 'mp4',
535 'title': 'md5:7b81415841e02ecd4313668cde88737a',
536 'description': 'md5:116377fd2963b81ec4ce64b542173306',
537 'upload_date': '20150625',
538 'uploader_id': 'dorappi2000',
539 'uploader': 'dorappi2000',
540 'formats': 'mincount:33',
541 },
2ee8f5d8 542 },
8a1a26ce
YCH
543 # DASH manifest with segment_list
544 {
545 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
546 'md5': '8ce563a1d667b599d21064e982ab9e31',
547 'info_dict': {
548 'id': 'CsmdDsKjzN8',
549 'ext': 'mp4',
17ee98e1 550 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
551 'uploader': 'Airtek',
552 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
553 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
554 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
555 },
556 'params': {
557 'youtube_include_dash_manifest': True,
558 'format': '135', # bestvideo
559 }
2ee8f5d8 560 },
2eb88d95
PH
561 ]
562
e0df6211
PH
563 def __init__(self, *args, **kwargs):
564 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 565 self._player_cache = {}
e0df6211 566
c5e8d7af
PH
567 def report_video_info_webpage_download(self, video_id):
568 """Report attempt to download video info webpage."""
69ea8ca4 569 self.to_screen('%s: Downloading video info webpage' % video_id)
c5e8d7af 570
c5e8d7af
PH
571 def report_information_extraction(self, video_id):
572 """Report attempt to extract video information."""
69ea8ca4 573 self.to_screen('%s: Extracting video information' % video_id)
c5e8d7af
PH
574
575 def report_unavailable_format(self, video_id, format):
576 """Report extracted video URL."""
69ea8ca4 577 self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
578
579 def report_rtmp_download(self):
580 """Indicate the download will use the RTMP protocol."""
69ea8ca4 581 self.to_screen('RTMP download detected')
c5e8d7af 582
60064c53
PH
583 def _signature_cache_id(self, example_sig):
584 """ Return a string representation of a signature """
78caa52a 585 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
586
587 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 588 id_m = re.match(
60620368 589 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 590 player_url)
c081b35c
PH
591 if not id_m:
592 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
593 player_type = id_m.group('ext')
594 player_id = id_m.group('id')
595
c4417ddb 596 # Read from filesystem cache
60064c53
PH
597 func_id = '%s_%s_%s' % (
598 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 599 assert os.path.basename(func_id) == func_id
a0e07d31 600
69ea8ca4 601 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 602 if cache_spec is not None:
78caa52a 603 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 604
6d1a55a5
PH
605 download_note = (
606 'Downloading player %s' % player_url
607 if self._downloader.params.get('verbose') else
608 'Downloading %s player %s' % (player_type, player_id)
609 )
e0df6211
PH
610 if player_type == 'js':
611 code = self._download_webpage(
612 player_url, video_id,
6d1a55a5 613 note=download_note,
69ea8ca4 614 errnote='Download of %s failed' % player_url)
83799698 615 res = self._parse_sig_js(code)
c4417ddb 616 elif player_type == 'swf':
e0df6211
PH
617 urlh = self._request_webpage(
618 player_url, video_id,
6d1a55a5 619 note=download_note,
69ea8ca4 620 errnote='Download of %s failed' % player_url)
e0df6211 621 code = urlh.read()
83799698 622 res = self._parse_sig_swf(code)
e0df6211
PH
623 else:
624 assert False, 'Invalid player type %r' % player_type
625
785521bf
PH
626 test_string = ''.join(map(compat_chr, range(len(example_sig))))
627 cache_res = res(test_string)
628 cache_spec = [ord(c) for c in cache_res]
83799698 629
69ea8ca4 630 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
631 return res
632
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the signature function func.

        func is applied to a probe string as long as example_sig; the result
        is turned into a concatenation of index and slice expressions on `s`
        and written out with to_screen.
        """
        def gen_sig_code(idxs):
            # Yield 's[...]' expressions covering idxs in order; runs that
            # change by a constant step of +1 or -1 are merged into one slice.
            def _genslice(start, end, step):
                # Build the slice expression from start to end (inclusive)
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: keep going while the step stays the same
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # A new run starts at prev
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Emit whatever is still pending after the last pair
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe string: the character at position k has code point k, so the
        # code points of the output tell which input index ended up where.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        # NOTE(review): the leading whitespace inside the second literal may
        # have been collapsed when this listing was extracted - confirm it
        # against the repository.
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
edf3e38e 671
e0df6211
PH
672 def _parse_sig_js(self, jscode):
673 funcname = self._search_regex(
aaeb86f6 674 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
8bcc8756 675 'Initial JS player signature function name')
2b25cb5d
PH
676
677 jsi = JSInterpreter(jscode)
678 initial_function = jsi.extract_function(funcname)
e0df6211
PH
679 return lambda s: initial_function([s])
680
681 def _parse_sig_swf(self, file_contents):
54256267 682 swfi = SWFInterpreter(file_contents)
78caa52a 683 TARGET_CLASSNAME = 'SignatureDecipher'
54256267 684 searched_class = swfi.extract_class(TARGET_CLASSNAME)
78caa52a 685 initial_function = swfi.extract_function(searched_class, 'decipher')
e0df6211
PH
686 return lambda s: initial_function([s])
687
83799698 688 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 689 """Turn the encrypted s field into a working signature"""
6b37f0be 690
c8bf86d5 691 if player_url is None:
69ea8ca4 692 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 693
69ea8ca4 694 if player_url.startswith('//'):
78caa52a 695 player_url = 'https:' + player_url
c8bf86d5 696 try:
62af3a0e 697 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
698 if player_id not in self._player_cache:
699 func = self._extract_signature_function(
60064c53 700 video_id, player_url, s
c8bf86d5
PH
701 )
702 self._player_cache[player_id] = func
703 func = self._player_cache[player_id]
704 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 705 self._print_sig_code(func, s)
c8bf86d5
PH
706 return func(s)
707 except Exception as e:
708 tb = traceback.format_exc()
709 raise ExtractorError(
78caa52a 710 'Signature extraction failed: ' + tb, cause=e)
e0df6211 711
def _get_subtitles(self, video_id, webpage):
    """Return the available (non-automatic) subtitles of a video.

    The result maps each language code to a list of
    {'url': ..., 'ext': ...} dicts, one per format (sbv, vtt, srt).
    An empty dict is returned, after a warning, when the track list
    cannot be downloaded or lists no tracks.
    """
    listing_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_list = self._download_xml(listing_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in track_list.findall('track'):
        lang = track.attrib['lang_code']
        # Only the first track listed for a language is used
        if lang not in subtitles:
            subtitles[lang] = [{
                'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                }),
                'ext': ext,
            } for ext in ('sbv', 'vtt', 'srt')]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
743
def _get_automatic_captions(self, video_id, webpage):
    """Return the automatic captions of a video.

    The captions URL is read from the player configuration embedded in
    webpage, so the already downloaded page is passed in instead of
    being fetched again.

    The result maps each target language code to a list of
    {'url': ..., 'ext': ...} dicts, one per format (sbv, vtt, srt).
    An empty dict is returned, after a warning, when no captions are found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    not_found_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(not_found_msg)
        return {}
    player_config = json.loads(config_match.group(1))

    try:
        player_args = player_config['args']
        base_url = player_args['ttsurl']
        timestamp = player_args['timestamp']

        # Ask for the list of available tracks and translation targets
        listing_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(base_url + '&' + listing_query, video_id)

        source_track = caption_list.find('track')
        if source_track is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        source_lang = source_track.attrib['lang_code']
        caption_kind = source_track.attrib.get('kind', '')

        captions = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            captions[target_lang] = [{
                'url': base_url + '&' + compat_urllib_parse.urlencode({
                    'lang': source_lang,
                    'tlang': target_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                }),
                'ext': ext,
            } for ext in ('sbv', 'vtt', 'srt')]
        return captions
    except (KeyError, ExtractorError):
        # An extractor error can be raised by the download process if there
        # are no automatic captions but there are subtitles
        self._downloader.report_warning(not_found_msg)
        return {}
796
@classmethod
def extract_id(cls, url):
    """Return the second capture group of cls._VALID_URL matched against url.

    The pattern is applied in verbose mode. Raises ExtractorError when
    url does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
804
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and map each itag to its format URL.

    Every non-empty manifest line that does not start with '#' is
    treated as a format URL; its itag is taken from the 'itag/<digits>/'
    part of the URL.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')

    itag_to_url = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#'-prefixed lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        itag_to_url[itag] = line
    return itag_to_url
819
def _extract_annotations(self, video_id):
    """Download and return the annotations document of a video."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 823
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
    """Download a DASH manifest and return the formats it describes.

    Any encrypted '/s/<sig>' component of dash_manifest_url is replaced
    by '/signature/<deciphered sig>' before the download.

    Returns a list of format dicts (one per distinct Representation id),
    or an empty list when the download fails and fatal is False.
    Audio and video representations are merged with the static
    information in self._formats; 'text/vtt' sets are ignored and any
    other (or missing) MIME type only produces a warning.
    """
    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest',
        fatal=fatal)

    # A non-fatal download failure is signalled by False
    if dash_doc is False:
        return []

    formats = []
    for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
        # May be None when the AdaptationSet carries no mimeType attribute
        mime_type = a.attrib.get('mimeType')
        for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                pass
            elif mime_type and mime_type.startswith(('audio/', 'video/')):
                segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'height': int_or_none(r.attrib.get('height')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                if segment_list is not None:
                    f.update({
                        'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
                        'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
                        'protocol': 'http_dash_segments',
                    })
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    # First time this format id is seen: start from the
                    # static description and overlay the manifest data
                    full_info = self._formats.get(format_id, {}).copy()
                    full_info.update(f)
                    codecs = r.attrib.get('codecs')
                    if codecs:
                        # The manifest codec string fills whichever codec
                        # field the static description leaves unset
                        if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                            full_info['vcodec'] = codecs
                        elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                            full_info['acodec'] = codecs
                    formats.append(full_info)
                else:
                    existing_format.update(f)
            else:
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
890
def _real_extract(self, url):
    """Extract metadata and formats for a single YouTube video.

    Downloads the watch page (and, when needed, the embed page and
    get_video_info), collects metadata, builds the format list from the
    stream maps / HLS manifest / DASH manifests and returns the info dict.

    Raises ExtractorError when YouTube reports an error, the video is a
    rental, the uploader cannot be determined or no format source is
    available.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end offsets may be given in the URL fragment or query
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember each distinct DASH manifest URL once
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise the text that was just extracted. `mobj` still
            # holds the uploader-id match at this point (or None), so it
            # must not be used here.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the target given in their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                # Additional DASH manifests may end up in HTTP Error 403 therefore
                # allow them to fail without bug report message if we already have
                # some DASH manifest succeeded. This is temporary workaround to reduce
                # burst of bug reports until we figure out the reason and whether it
                # can be fixed at all.
                dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }
c5e8d7af 1278
5f6a1245 1279
880e1c52 1280class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
78caa52a 1281 IE_DESC = 'YouTube.com playlists'
d67cc9fa 1282 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1283 (?:https?://)?
1284 (?:\w+\.)?
1285 youtube\.com/
1286 (?:
ac7553d0 1287 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
c5e8d7af
PH
1288 \? (?:.*?&)*? (?:p|a|list)=
1289 | p/
1290 )
d67cc9fa 1291 (
99209c29 1292 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
5f6a1245 1293 # Top tracks, they can also include dots
d67cc9fa
JMF
1294 |(?:MC)[\w\.]*
1295 )
c5e8d7af
PH
1296 .*
1297 |
99209c29 1298 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
c5e8d7af 1299 )"""
dbb94fb0 1300 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dbb94fb0 1301 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
78caa52a 1302 IE_NAME = 'youtube:playlist'
81127aa5
PH
1303 _TESTS = [{
1304 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1305 'info_dict': {
1306 'title': 'ytdl test PL',
a1cf99d0 1307 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
1308 },
1309 'playlist_count': 3,
9291475f
PH
1310 }, {
1311 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1312 'info_dict': {
acf757f4 1313 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
1314 'title': 'YDL_Empty_List',
1315 },
1316 'playlist_count': 0,
1317 }, {
1318 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1319 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1320 'info_dict': {
1321 'title': '29C3: Not my department',
acf757f4 1322 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
1323 },
1324 'playlist_count': 95,
1325 }, {
1326 'note': 'issue #673',
1327 'url': 'PLBB231211A4F62143',
1328 'info_dict': {
f46a8702 1329 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 1330 'id': 'PLBB231211A4F62143',
9291475f
PH
1331 },
1332 'playlist_mincount': 26,
1333 }, {
1334 'note': 'Large playlist',
1335 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1336 'info_dict': {
1337 'title': 'Uploads from Cauchemar',
acf757f4 1338 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
1339 },
1340 'playlist_mincount': 799,
1341 }, {
1342 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1343 'info_dict': {
1344 'title': 'YDL_safe_search',
acf757f4 1345 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
1346 },
1347 'playlist_count': 2,
ac7553d0
PH
1348 }, {
1349 'note': 'embedded',
1350 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1351 'playlist_count': 4,
1352 'info_dict': {
1353 'title': 'JODA15',
acf757f4 1354 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 1355 }
6b08cdf6
PH
1356 }, {
1357 'note': 'Embedded SWF player',
1358 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1359 'playlist_count': 4,
1360 'info_dict': {
1361 'title': 'JODA7',
acf757f4 1362 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 1363 }
4b7df0d3
JMF
1364 }, {
1365 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1366 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1367 'info_dict': {
acf757f4
PH
1368 'title': 'Uploads from Interstellar Movie',
1369 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3
JMF
1370 },
1371 'playlist_mincout': 21,
81127aa5 1372 }]
c5e8d7af 1373
880e1c52
JMF
1374 def _real_initialize(self):
1375 self._login()
1376
652cdaa2 1377 def _extract_mix(self, playlist_id):
99209c29 1378 # The mixes are generated from a single video
652cdaa2 1379 # the id of the playlist is just 'RD' + video_id
7d4afc55 1380 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
c9cc0bf5 1381 webpage = self._download_webpage(
78caa52a 1382 url, playlist_id, 'Downloading Youtube mix')
bc2f773b 1383 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
c9cc0bf5
PH
1384 title_span = (
1385 search_title('playlist-title') or
1386 search_title('title long-title') or
1387 search_title('title'))
76d1700b 1388 title = clean_html(title_span)
c9cc0bf5
PH
1389 ids = orderedSet(re.findall(
1390 r'''(?xs)data-video-username=".*?".*?
1391 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
1392 webpage))
652cdaa2
JMF
1393 url_results = self._ids_to_results(ids)
1394
1395 return self.playlist_result(url_results, playlist_id, title)
1396
448830ce 1397 def _extract_playlist(self, playlist_id):
dbb94fb0
S
1398 url = self._TEMPLATE_URL % playlist_id
1399 page = self._download_webpage(url, playlist_id)
dbb94fb0 1400
39b62db1
YCH
1401 for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
1402 match = match.strip()
1403 # Check if the playlist exists or is private
1404 if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
1405 raise ExtractorError(
1406 'The playlist doesn\'t exist or is private, use --username or '
1407 '--netrc to access it.',
1408 expected=True)
1409 elif re.match(r'[^<]*Invalid parameters[^<]*', match):
1410 raise ExtractorError(
1411 'Invalid parameters. Maybe URL is incorrect.',
1412 expected=True)
1413 elif re.match(r'[^<]*Choose your language[^<]*', match):
1414 continue
1415 else:
1416 self.report_warning('Youtube gives an alert message: ' + match)
10c0e2d8 1417
dcbb4580 1418 # Extract the video ids from the playlist pages
70219b0f
JMF
1419 def _entries():
1420 more_widget_html = content_html = page
1421 for page_num in itertools.count(1):
1422 matches = re.finditer(self._VIDEO_RE, content_html)
1423 # We remove the duplicates and the link with index 0
1424 # (it's not the first video of the playlist)
1425 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
1426 for vid_id in new_ids:
1427 yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
1428
1429 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1430 if not mobj:
1431 break
1432
1433 more = self._download_json(
1434 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1435 'Downloading page #%s' % page_num,
1436 transform_source=uppercase_escape)
1437 content_html = more['content_html']
1438 if not content_html.strip():
1439 # Some webpages show a "Load more" button but they don't
1440 # have more videos
1441 break
1442 more_widget_html = more['load_more_widget_html']
dbb94fb0
S
1443
1444 playlist_title = self._html_search_regex(
68eb8e90 1445 r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
78caa52a 1446 page, 'title')
c5e8d7af 1447
70219b0f 1448 return self.playlist_result(_entries(), playlist_id, playlist_title)
c5e8d7af 1449
448830ce
S
1450 def _real_extract(self, url):
1451 # Extract playlist id
1452 mobj = re.match(self._VALID_URL, url)
1453 if mobj is None:
1454 raise ExtractorError('Invalid URL: %s' % url)
1455 playlist_id = mobj.group(1) or mobj.group(2)
1456
1457 # Check if it's a video-specific URL
1458 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1459 if 'v' in query_dict:
1460 video_id = query_dict['v'][0]
1461 if self._downloader.params.get('noplaylist'):
1462 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1463 return self.url_result(video_id, 'Youtube', video_id=video_id)
1464 else:
1465 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1466
1467 if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
1468 # Mixes require a custom extraction process
1469 return self._extract_mix(playlist_id)
1470
1471 return self._extract_playlist(playlist_id)
1472
c5e8d7af
PH
1473
class YoutubeChannelIE(InfoExtractor):
    """Extract the videos listed on a YouTube channel page.

    Extraction first tries to map the channel onto its uploads playlist
    and delegates to the playlist extractor; if that is not possible it
    falls back to walking the channel's paged video listing.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs for the watch links found in page.

        Ids are reported once, in order of first appearance. When a later
        link to an already seen id carries a title and the stored title is
        empty, the stored title is filled in.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                # First occurrence of this id
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Return a playlist result (or a redirect to one) for the channel URL."""
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        # The download is non-fatal, so there may be no page text to search;
        # only run the regex helpers on an actual page, otherwise go
        # straight to the paged fallback below.
        channel_playlist_id = None
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # The playlist id is the channel id with its "UC" prefix replaced by "UU"
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Lazily yield the videos of each page, following the
            # "load more" link until none is offered any more.
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1566
1567
class YoutubeUserIE(YoutubeChannelIE):
    """Channel-style extractor addressed by user name or the "ytuser:" keyword."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other ``*IE`` class in this module accepts url.

        The user URL pattern is too permissive on its own and would also
        match URLs that belong to the other YouTube extractors, so those
        are consulted first.
        """
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1594
b05654f0 1595
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Search extractor that pages through the YouTube results listing."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Additional query-string arguments; subclasses override this to change
    # the search (e.g. the sort order).
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another until either a page
        contributes no videos or at least n videos have been collected;
        the list is then cut down to n entries. Raises ExtractorError when
        a downloaded page carries a "search-message" block.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as the request is satisfied: asking for one more
            # page once exactly n results are in hand is wasted work and
            # could even fail with "No video results".
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1639
c9ae7b95 1640
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant that adds a ``search_sort`` argument to the query."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1646
c9ae7b95
PH
1647
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a YouTube results-page URL into a playlist of the listed items."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Download the results page and return its items as a playlist dict.

        The decoded ``search_query`` value is used as the playlist title.
        """
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from(item_html):
            # The title is optional (non-fatal lookup), the link is required
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            }

        # Each result item is wrapped in an <h3 class="yt-lockup-title"> element
        item_blocks = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)

        return {
            '_type': 'playlist',
            'entries': [_entry_from(item_html) for item_html in item_blocks],
            'title': query,
        }
1689
1690
class YoutubeShowIE(InfoExtractor):
    """Extract a show page as a playlist of its season playlists."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist dict with one YoutubePlaylist entry per season link."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1725
1726
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed and return them as a playlist.

        The feed page and its "load more" continuations are scanned for
        watch links; collection stops when a portion brings no new ids or
        when no further "load more" link is offered.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: a lazy filter object is always truthy,
            # which would defeat the emptiness check below.
            new_ids = [video_id for video_id in orderedSet(matches) if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1774
1775
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Playlist extractor bound to the fixed "WL" playlist id."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Extract the "WL" playlist regardless of which accepted URL form was used."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
f459d170 1785
5f6a1245 1786
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and delegate to YoutubePlaylist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result for the playlist id found on the my_favorites page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from the first "list=" parameter on the page
        return self.url_result(
            self._search_regex(r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1797
1798
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the "recommended" feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 1804
1ed5b5c9 1805
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the "subscriptions" feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 1811
1ed5b5c9 1812
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the "history" feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the sibling feed extractors: "\." in a plain string
    # literal is an invalid escape sequence (same value, but it warns).
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1818
1819
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that lack a video id and explain why."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting at shell quoting."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1863
1864
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id has only 1 to 10 characters."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_id'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)