]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[vimeo:watchlater] Fix extraction (Closes #3886)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
7c80519c 22 compat_urllib_parse_urlparse,
c5e8d7af 23 compat_urllib_request,
7c61bd36 24 compat_urlparse,
c5e8d7af 25 compat_str,
4bb4a188
PH
26)
27from ..utils import (
c5e8d7af 28 clean_html,
c5e8d7af 29 ExtractorError,
2d30521a 30 float_or_none,
4bb4a188
PH
31 get_element_by_attribute,
32 get_element_by_id,
dd27fd17 33 int_or_none,
4bb4a188 34 orderedSet,
7c80519c 35 parse_duration,
cf7e015f 36 smuggle_url,
c93d53f5 37 str_to_int,
c5e8d7af
PH
38 unescapeHTML,
39 unified_strdate,
cf7e015f 40 unsmuggle_url,
81c2f20b 41 uppercase_escape,
af214c3a 42 ISO3166Utils,
c5e8d7af
PH
43)
44
5f6a1245 45
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie (containing hl=en) on .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one url_result per video id, targeting the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Failure is reported as False (not None), as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be built; give up
                # instead of crashing on match.group() with match being None
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same reasoning as for secTok above
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; no-op without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 197
8377574c 198
360e1ca5 199class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 200 IE_DESC = 'YouTube.com'
cb7dfeea 201 _VALID_URL = r"""(?x)^
c5e8d7af 202 (
edb53e2d 203 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 204 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 205 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 206 (?:www\.)?pwnyoutube\.com/|
f7000f3a 207 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
208 tube\.majestyc\.net/|
209 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
210 (?:.*?\#/)? # handle anchor (#/) redirect urls
211 (?: # the various things that can precede the ID:
ac7553d0 212 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 213 |(?: # or the v= param in all its forms
f7000f3a 214 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
215 (?:\?|\#!?) # the params delimiter ? or # or #!
216 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
217 v=
218 )
f4b05232
JMF
219 ))
220 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 221 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 222 )
c5e8d7af 223 )? # all until now is optional -> you can pass the naked ID
8963d9c2 224 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 225 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
226 (?(1).+)? # if we found the ID, everything can follow
227 $"""
c5e8d7af 228 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
229 _formats = {
230 '5': {'ext': 'flv', 'width': 400, 'height': 240},
231 '6': {'ext': 'flv', 'width': 450, 'height': 270},
232 '13': {'ext': '3gp'},
233 '17': {'ext': '3gp', 'width': 176, 'height': 144},
234 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
235 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
236 '34': {'ext': 'flv', 'width': 640, 'height': 360},
237 '35': {'ext': 'flv', 'width': 854, 'height': 480},
238 '36': {'ext': '3gp', 'width': 320, 'height': 240},
239 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
240 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
241 '43': {'ext': 'webm', 'width': 640, 'height': 360},
242 '44': {'ext': 'webm', 'width': 854, 'height': 480},
243 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
244 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
245 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
246 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 247
1d043b93 248
86fe61c8 249 # 3d videos
43b81eb9
PH
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 257
96fb5605 258 # Apple HTTP Live Streaming
43b81eb9
PH
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
266
267 # DASH mp4 video
43b81eb9
PH
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 273 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
276 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
277 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 279
f6f1fc92 280 # Dash mp4 audio
62cd676c
PH
281 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
282 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
283 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
284
285 # Dash webm
4c6bd5b5
JMF
286 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
287 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
288 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
289 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
290 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
291 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
292 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
e75cafe9
A
293 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 300 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 301 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
4c6bd5b5
JMF
302 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
303 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
304 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
305 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
306 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
2c62dc26
PH
307
308 # Dash webm audio
55db73ef 309 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 310 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 311
0857baad
PH
312 # Dash webm audio with opus inside
313 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
314 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
315 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
316
ce6b9a2d
PH
317 # RTMP (unnamed)
318 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 319 }
836a086c 320
78caa52a 321 IE_NAME = 'youtube'
2eb88d95
PH
322 _TESTS = [
323 {
297a564b 324 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
4bc3a23e
PH
325 'info_dict': {
326 'id': 'BaW_jenozKc',
327 'ext': 'mp4',
328 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
329 'uploader': 'Philipp Hagemeister',
330 'uploader_id': 'phihag',
331 'upload_date': '20121002',
332 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
333 'categories': ['Science & Technology'],
000b6b5a 334 'tags': ['youtube-dl'],
3e7c1224
PH
335 'like_count': int,
336 'dislike_count': int,
7c80519c 337 'start_time': 1,
297a564b 338 'end_time': 9,
2eb88d95 339 }
0e853ca4 340 },
0e853ca4 341 {
4bc3a23e
PH
342 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
343 'note': 'Test generic use_cipher_signature video (#897)',
344 'info_dict': {
345 'id': 'UxxajLWwzqY',
346 'ext': 'mp4',
347 'upload_date': '20120506',
348 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
000b6b5a
S
349 'description': 'md5:782e8651347686cba06e58f71ab51773',
350 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
351 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
352 'iconic ep', 'iconic', 'love', 'it'],
4bc3a23e
PH
353 'uploader': 'Icona Pop',
354 'uploader_id': 'IconaPop',
2eb88d95 355 }
c108eb73
JMF
356 },
357 {
4bc3a23e
PH
358 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
359 'note': 'Test VEVO video with age protection (#956)',
360 'info_dict': {
361 'id': '07FYdnEawAQ',
362 'ext': 'mp4',
363 'upload_date': '20130703',
364 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
365 'description': 'md5:64249768eec3bc4276236606ea996373',
366 'uploader': 'justintimberlakeVEVO',
367 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
368 }
369 },
fccd3771 370 {
4bc3a23e
PH
371 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
372 'note': 'Embed-only video (#1746)',
373 'info_dict': {
374 'id': 'yZIXLfi8CZQ',
375 'ext': 'mp4',
376 'upload_date': '20120608',
377 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
378 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
379 'uploader': 'SET India',
380 'uploader_id': 'setindia'
fccd3771
PH
381 }
382 },
dd27fd17 383 {
4bc3a23e
PH
384 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
385 'note': '256k DASH audio (format 141) via DASH manifest',
386 'info_dict': {
387 'id': 'a9LDPn-MO4I',
388 'ext': 'm4a',
389 'upload_date': '20121002',
390 'uploader_id': '8KVIDEO',
391 'description': '',
392 'uploader': '8KVIDEO',
393 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 394 },
4bc3a23e
PH
395 'params': {
396 'youtube_include_dash_manifest': True,
397 'format': '141',
4919603f 398 },
dd27fd17 399 },
3489b7d2
JMF
400 # DASH manifest with encrypted signature
401 {
78caa52a
PH
402 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
403 'info_dict': {
404 'id': 'IB3lcPjvWLA',
405 'ext': 'm4a',
b766eb27
JMF
406 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
407 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
408 'uploader': 'AfrojackVEVO',
409 'uploader_id': 'AfrojackVEVO',
410 'upload_date': '20131011',
3489b7d2 411 },
4bc3a23e 412 'params': {
78caa52a
PH
413 'youtube_include_dash_manifest': True,
414 'format': '141',
3489b7d2
JMF
415 },
416 },
aaeb86f6
S
417 # JS player signature function name containing $
418 {
419 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
420 'info_dict': {
421 'id': 'nfWlot6h_JM',
422 'ext': 'm4a',
423 'title': 'Taylor Swift - Shake It Off',
424 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
425 'uploader': 'TaylorSwiftVEVO',
426 'uploader_id': 'TaylorSwiftVEVO',
427 'upload_date': '20140818',
428 },
429 'params': {
430 'youtube_include_dash_manifest': True,
431 'format': '141',
432 },
433 },
aa79ac0c
PH
434 # Controversy video
435 {
436 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
437 'info_dict': {
438 'id': 'T4XJQO3qol8',
439 'ext': 'mp4',
440 'upload_date': '20100909',
441 'uploader': 'The Amazing Atheist',
442 'uploader_id': 'TheAmazingAtheist',
443 'title': 'Burning Everyone\'s Koran',
444 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
445 }
c522adb1
JMF
446 },
447 # Normal age-gate video (No vevo, embed allowed)
448 {
449 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
450 'info_dict': {
451 'id': 'HtVdAasjOgU',
452 'ext': 'mp4',
453 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 454 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
455 'uploader': 'The Witcher',
456 'uploader_id': 'WitcherGame',
457 'upload_date': '20140605',
458 },
459 },
fccae2b9
S
460 # Age-gate video with encrypted signature
461 {
462 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
463 'info_dict': {
464 'id': '6kLq3WMV1nU',
465 'ext': 'mp4',
466 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
467 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
468 'uploader': 'LloydVEVO',
469 'uploader_id': 'LloydVEVO',
470 'upload_date': '20110629',
471 },
472 },
774e208f
PH
473 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
474 {
475 'url': '__2ABJjxzNo',
476 'info_dict': {
477 'id': '__2ABJjxzNo',
478 'ext': 'mp4',
479 'upload_date': '20100430',
480 'uploader_id': 'deadmau5',
481 'description': 'md5:12c56784b8032162bb936a5f76d55360',
482 'uploader': 'deadmau5',
483 'title': 'Deadmau5 - Some Chords (HD)',
484 },
485 'expected_warnings': [
486 'DASH manifest missing',
487 ]
e52a40ab
PH
488 },
489 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
490 {
491 'url': 'lqQg6PlCWgI',
492 'info_dict': {
493 'id': 'lqQg6PlCWgI',
494 'ext': 'mp4',
cbe2bd91
PH
495 'upload_date': '20120731',
496 'uploader_id': 'olympic',
497 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
498 'uploader': 'Olympics',
499 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
500 },
501 'params': {
502 'skip_download': 'requires avconv',
e52a40ab 503 }
cbe2bd91 504 },
6271f1ca
PH
505 # Non-square pixels
506 {
507 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
508 'info_dict': {
509 'id': '_b-2C3KPAM0',
510 'ext': 'mp4',
511 'stretched_ratio': 16 / 9.,
512 'upload_date': '20110310',
513 'uploader_id': 'AllenMeow',
514 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
515 'uploader': '孫艾倫',
516 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
517 },
06b491eb
S
518 },
519 # url_encoded_fmt_stream_map is empty string
520 {
521 'url': 'qEJwOuvDf7I',
522 'info_dict': {
523 'id': 'qEJwOuvDf7I',
524 'ext': 'mp4',
525 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
526 'description': '',
527 'upload_date': '20150404',
528 'uploader_id': 'spbelect',
529 'uploader': 'Наблюдатели Петербурга',
530 },
531 'params': {
532 'skip_download': 'requires avconv',
533 }
534 },
da77d856
S
535 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
536 {
537 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
538 'info_dict': {
539 'id': 'FIl7x6_3R5Y',
540 'ext': 'mp4',
541 'title': 'md5:7b81415841e02ecd4313668cde88737a',
542 'description': 'md5:116377fd2963b81ec4ce64b542173306',
543 'upload_date': '20150625',
544 'uploader_id': 'dorappi2000',
545 'uploader': 'dorappi2000',
546 'formats': 'mincount:33',
547 },
2ee8f5d8 548 },
8a1a26ce
YCH
549 # DASH manifest with segment_list
550 {
551 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
552 'md5': '8ce563a1d667b599d21064e982ab9e31',
553 'info_dict': {
554 'id': 'CsmdDsKjzN8',
555 'ext': 'mp4',
17ee98e1 556 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
557 'uploader': 'Airtek',
558 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
559 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
560 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
561 },
562 'params': {
563 'youtube_include_dash_manifest': True,
564 'format': '135', # bestvideo
565 }
2ee8f5d8 566 },
cf7e015f
S
567 {
568 # Multifeed videos (multiple cameras), URL is for Main Camera
569 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
570 'info_dict': {
571 'id': 'jqWvoWXjCVs',
572 'title': 'teamPGP: Rocket League Noob Stream',
573 'description': 'md5:dc7872fb300e143831327f1bae3af010',
574 },
575 'playlist': [{
576 'info_dict': {
577 'id': 'jqWvoWXjCVs',
578 'ext': 'mp4',
579 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
580 'description': 'md5:dc7872fb300e143831327f1bae3af010',
581 'upload_date': '20150721',
582 'uploader': 'Beer Games Beer',
583 'uploader_id': 'beergamesbeer',
584 },
585 }, {
586 'info_dict': {
587 'id': '6h8e8xoXJzg',
588 'ext': 'mp4',
589 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
590 'description': 'md5:dc7872fb300e143831327f1bae3af010',
591 'upload_date': '20150721',
592 'uploader': 'Beer Games Beer',
593 'uploader_id': 'beergamesbeer',
594 },
595 }, {
596 'info_dict': {
597 'id': 'PUOgX5z9xZw',
598 'ext': 'mp4',
599 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
600 'description': 'md5:dc7872fb300e143831327f1bae3af010',
601 'upload_date': '20150721',
602 'uploader': 'Beer Games Beer',
603 'uploader_id': 'beergamesbeer',
604 },
605 }, {
606 'info_dict': {
607 'id': 'teuwxikvS5k',
608 'ext': 'mp4',
609 'title': 'teamPGP: Rocket League Noob Stream (zim)',
610 'description': 'md5:dc7872fb300e143831327f1bae3af010',
611 'upload_date': '20150721',
612 'uploader': 'Beer Games Beer',
613 'uploader_id': 'beergamesbeer',
614 },
615 }],
616 'params': {
617 'skip_download': True,
618 },
619 }
2eb88d95
PH
620 ]
621
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base extractor, then create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature cache id)
    self._player_cache = dict()
e0df6211 625
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage for video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 629
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce that video information for video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
633
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for video_id."""
    self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
637
def report_rtmp_download(self):
    """Announce that the download is going to use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 641
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of example_sig."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
645
646 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 647 id_m = re.match(
60620368 648 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 649 player_url)
c081b35c
PH
650 if not id_m:
651 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
652 player_type = id_m.group('ext')
653 player_id = id_m.group('id')
654
c4417ddb 655 # Read from filesystem cache
60064c53
PH
656 func_id = '%s_%s_%s' % (
657 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 658 assert os.path.basename(func_id) == func_id
a0e07d31 659
69ea8ca4 660 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 661 if cache_spec is not None:
78caa52a 662 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 663
6d1a55a5
PH
664 download_note = (
665 'Downloading player %s' % player_url
666 if self._downloader.params.get('verbose') else
667 'Downloading %s player %s' % (player_type, player_id)
668 )
e0df6211
PH
669 if player_type == 'js':
670 code = self._download_webpage(
671 player_url, video_id,
6d1a55a5 672 note=download_note,
69ea8ca4 673 errnote='Download of %s failed' % player_url)
83799698 674 res = self._parse_sig_js(code)
c4417ddb 675 elif player_type == 'swf':
e0df6211
PH
676 urlh = self._request_webpage(
677 player_url, video_id,
6d1a55a5 678 note=download_note,
69ea8ca4 679 errnote='Download of %s failed' % player_url)
e0df6211 680 code = urlh.read()
83799698 681 res = self._parse_sig_swf(code)
e0df6211
PH
682 else:
683 assert False, 'Invalid player type %r' % player_type
684
785521bf
PH
685 test_string = ''.join(map(compat_chr, range(len(example_sig))))
686 cache_res = res(test_string)
687 cache_spec = [ord(c) for c in cache_res]
83799698 688
69ea8ca4 689 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
690 return res
691
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function func.

    func is run on a probe string whose i-th character has code point i; the
    resulting index sequence is rendered as a sum of `s[...]` index and slice
    expressions, wrapped in an `if` that checks the lengths of the
    dot-separated signature parts, and written with to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive) walked with the given stride
        stop = last + stride
        head = str(first) if first != 0 else ''
        tail = ':' if stop < 0 else (':%d' % stop)
        suffix = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (head, tail, suffix)

    def _sig_terms(positions):
        stride = None
        # Placeholder only; run_start is always assigned together with stride
        run_start = '(Never used)'
        for cur, before in zip(positions[1:], positions[:-1]):
            delta = cur - before
            if stride is not None:
                # Inside a run: keep going while the stride holds,
                # otherwise emit the finished run and start over
                if delta != stride:
                    yield _slice_expr(run_start, before, stride)
                    stride = None
            elif delta in [-1, 1]:
                stride = delta
                run_start = before
            else:
                yield 's[%d]' % before
        # Flush whatever is pending for the last position
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(compat_chr(n) for n in range(len(example_sig)))
    index_spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(_sig_terms(index_spec))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    # NOTE(review): whitespace inside these literals is reproduced exactly as
    # it appears in the source view - confirm against the repository
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 730
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in jscode.

    The function name is taken from the `.sig||<name>(` occurrence in the
    player code and the function is extracted with JSInterpreter. The
    returned callable passes its argument as the single element of the
    argument list.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    js_function = JSInterpreter(jscode).extract_function(sig_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
739
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of a SWF player.

    Looks up the ``decipher`` function of the ``SignatureDecipher`` class
    and returns a callable that passes the encrypted signature string to
    it as its only argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def _apply(s):
        return decipher([s])
    return _apply
746
83799698 747 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 748 """Turn the encrypted s field into a working signature"""
6b37f0be 749
c8bf86d5 750 if player_url is None:
69ea8ca4 751 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 752
69ea8ca4 753 if player_url.startswith('//'):
78caa52a 754 player_url = 'https:' + player_url
c8bf86d5 755 try:
62af3a0e 756 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
757 if player_id not in self._player_cache:
758 func = self._extract_signature_function(
60064c53 759 video_id, player_url, s
c8bf86d5
PH
760 )
761 self._player_cache[player_id] = func
762 func = self._player_cache[player_id]
763 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 764 self._print_sig_code(func, s)
c8bf86d5
PH
765 return func(s)
766 except Exception as e:
767 tb = traceback.format_exc()
768 raise ExtractorError(
78caa52a 769 'Signature extraction failed: ' + tb, cause=e)
e0df6211 770
360e1ca5 771 def _get_subtitles(self, video_id, webpage):
de7f3446 772 try:
60e47a26 773 subs_doc = self._download_xml(
38c2e5b8 774 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
775 video_id, note=False)
776 except ExtractorError as err:
69ea8ca4 777 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446 778 return {}
de7f3446
JMF
779
780 sub_lang_list = {}
60e47a26
JMF
781 for track in subs_doc.findall('track'):
782 lang = track.attrib['lang_code']
7e660ac1
LD
783 if lang in sub_lang_list:
784 continue
360e1ca5
JMF
785 sub_formats = []
786 for ext in ['sbv', 'vtt', 'srt']:
787 params = compat_urllib_parse.urlencode({
788 'lang': lang,
789 'v': video_id,
790 'fmt': ext,
791 'name': track.attrib['name'].encode('utf-8'),
792 })
793 sub_formats.append({
794 'url': 'https://www.youtube.com/api/timedtext?' + params,
795 'ext': ext,
796 })
797 sub_lang_list[lang] = sub_formats
de7f3446 798 if not sub_lang_list:
69ea8ca4 799 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
800 return {}
801 return sub_lang_list
802
360e1ca5 803 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
804 """We need the webpage for getting the captions url, pass it as an
805 argument to speed up the process."""
69ea8ca4 806 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 807 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 808 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
809 if mobj is None:
810 self._downloader.report_warning(err_msg)
811 return {}
812 player_config = json.loads(mobj.group(1))
813 try:
0792d563
PH
814 args = player_config['args']
815 caption_url = args['ttsurl']
816 timestamp = args['timestamp']
055e6f36
JMF
817 # We get the available subtitles
818 list_params = compat_urllib_parse.urlencode({
819 'type': 'list',
820 'tlangs': 1,
821 'asrs': 1,
de7f3446 822 })
055e6f36 823 list_url = caption_url + '&' + list_params
e26f8712 824 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 825 original_lang_node = caption_list.find('track')
7d900ef1 826 if original_lang_node is None:
69ea8ca4 827 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
828 return {}
829 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 830 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
831
832 sub_lang_list = {}
833 for lang_node in caption_list.findall('target'):
834 sub_lang = lang_node.attrib['lang_code']
360e1ca5
JMF
835 sub_formats = []
836 for ext in ['sbv', 'vtt', 'srt']:
837 params = compat_urllib_parse.urlencode({
838 'lang': original_lang,
839 'tlang': sub_lang,
840 'fmt': ext,
841 'ts': timestamp,
842 'kind': caption_kind,
843 })
844 sub_formats.append({
845 'url': caption_url + '&' + params,
846 'ext': ext,
847 })
848 sub_lang_list[sub_lang] = sub_formats
055e6f36 849 return sub_lang_list
de7f3446
JMF
850 # An extractor error can be raise by the download process if there are
851 # no automatic captions but there are subtitles
852 except (KeyError, ExtractorError):
853 self._downloader.report_warning(err_msg)
854 return {}
855
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second group of ``cls._VALID_URL`` matched against *url*.

    The pattern is compiled in verbose mode.  Raises ExtractorError when
    *url* does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
863
1d043b93
JMF
864 def _extract_from_m3u8(self, manifest_url, video_id):
865 url_map = {}
5f6a1245 866
1d043b93
JMF
867 def _get_urls(_manifest):
868 lines = _manifest.split('\n')
869 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 870 lines)
1d043b93 871 return urls
78caa52a 872 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
873 formats_urls = _get_urls(manifest)
874 for format_url in formats_urls:
890f62e8 875 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
876 url_map[itag] = format_url
877 return url_map
878
1fb07d10
JG
879 def _extract_annotations(self, video_id):
880 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 881 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 882
da276600 883 def _parse_dash_manifest(
77c6fb5b 884 self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
774e208f
PH
885 def decrypt_sig(mobj):
886 s = mobj.group(1)
887 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
888 return '/signature/%s' % dec_s
e1b9322b 889 dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
774e208f
PH
890 dash_doc = self._download_xml(
891 dash_manifest_url, video_id,
892 note='Downloading DASH manifest',
77c6fb5b
S
893 errnote='Could not download DASH manifest',
894 fatal=fatal)
895
896 if dash_doc is False:
897 return []
774e208f
PH
898
899 formats = []
de5c5456
YCH
900 for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
901 mime_type = a.attrib.get('mimeType')
902 for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
903 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
904 if url_el is None:
905 continue
906 if mime_type == 'text/vtt':
907 # TODO implement WebVTT downloading
908 pass
909 elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
6800d337 910 segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
de5c5456
YCH
911 format_id = r.attrib['id']
912 video_url = url_el.text
913 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
914 f = {
915 'format_id': format_id,
916 'url': video_url,
917 'width': int_or_none(r.attrib.get('width')),
918 'height': int_or_none(r.attrib.get('height')),
919 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
920 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
921 'filesize': filesize,
922 'fps': int_or_none(r.attrib.get('frameRate')),
923 }
0c8662d2 924 if segment_list is not None:
6800d337
YCH
925 f.update({
926 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
b9258c61 927 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
423d2be5 928 'protocol': 'http_dash_segments',
6800d337 929 })
de5c5456
YCH
930 try:
931 existing_format = next(
932 fo for fo in formats
933 if fo['format_id'] == format_id)
934 except StopIteration:
935 full_info = self._formats.get(format_id, {}).copy()
936 full_info.update(f)
1b5a1ae2
S
937 codecs = r.attrib.get('codecs')
938 if codecs:
939 if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
940 full_info['vcodec'] = codecs
941 elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
942 full_info['acodec'] = codecs
de5c5456
YCH
943 formats.append(full_info)
944 else:
945 existing_format.update(f)
946 else:
947 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
774e208f
PH
948 return formats
949
def _real_extract(self, url):
    """Extract the info dict (or multifeed playlist) for a YouTube video URL.

    Steps, in order: read start/end times from the URL, download the watch
    page, obtain the video info (from the embed page for age-gated videos,
    otherwise from the page's player config and/or get_video_info), collect
    metadata from the page, build the format list (rtmp, url-encoded format
    maps with signature decryption, or HLS), merge in DASH manifest formats
    and return the resulting dict.

    Raises ExtractorError when YouTube reports a reason for refusing the
    video, when no token/uploader/format information is found, or for
    rental and rtmpe videos.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    # The fragment is inspected before the query, so it takes precedence
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened redirect links by the full URL kept in their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
            for feed in multifeed_metadata_list.split(','):
                feed_data = compat_parse_qs(feed)
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise the date text that was just extracted.  (mobj at
            # this point still belongs to the uploader id search and may
            # be None, so it must not be used here.)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                # Additional DASH manifests may end up in HTTP Error 403 therefore
                # allow them to fail without bug report message if we already have
                # some DASH manifest succeeded. This is temporary workaround to reduce
                # burst of bug reports until we figure out the reason and whether it
                # can be fixed at all.
                dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        stretch_w = float(stretched_m.group('w'))
        stretch_h = float(stretched_m.group('h'))
        # The tag may carry an unusable ratio such as 17:0; only apply
        # ratios that can actually be computed
        if stretch_w > 0 and stretch_h > 0:
            ratio = stretch_w / stretch_h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }
c5e8d7af 1366
5f6a1245 1367
880e1c52 1368class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
78caa52a 1369 IE_DESC = 'YouTube.com playlists'
d67cc9fa 1370 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1371 (?:https?://)?
1372 (?:\w+\.)?
1373 youtube\.com/
1374 (?:
ac7553d0 1375 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
c5e8d7af
PH
1376 \? (?:.*?&)*? (?:p|a|list)=
1377 | p/
1378 )
d67cc9fa 1379 (
99209c29 1380 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
5f6a1245 1381 # Top tracks, they can also include dots
d67cc9fa
JMF
1382 |(?:MC)[\w\.]*
1383 )
c5e8d7af
PH
1384 .*
1385 |
99209c29 1386 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
c5e8d7af 1387 )"""
dbb94fb0 1388 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dbb94fb0 1389 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
78caa52a 1390 IE_NAME = 'youtube:playlist'
81127aa5
PH
1391 _TESTS = [{
1392 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1393 'info_dict': {
1394 'title': 'ytdl test PL',
a1cf99d0 1395 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
1396 },
1397 'playlist_count': 3,
9291475f
PH
1398 }, {
1399 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1400 'info_dict': {
acf757f4 1401 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
1402 'title': 'YDL_Empty_List',
1403 },
1404 'playlist_count': 0,
1405 }, {
1406 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1407 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1408 'info_dict': {
1409 'title': '29C3: Not my department',
acf757f4 1410 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
1411 },
1412 'playlist_count': 95,
1413 }, {
1414 'note': 'issue #673',
1415 'url': 'PLBB231211A4F62143',
1416 'info_dict': {
f46a8702 1417 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 1418 'id': 'PLBB231211A4F62143',
9291475f
PH
1419 },
1420 'playlist_mincount': 26,
1421 }, {
1422 'note': 'Large playlist',
1423 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1424 'info_dict': {
1425 'title': 'Uploads from Cauchemar',
acf757f4 1426 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
1427 },
1428 'playlist_mincount': 799,
1429 }, {
1430 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1431 'info_dict': {
1432 'title': 'YDL_safe_search',
acf757f4 1433 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
1434 },
1435 'playlist_count': 2,
ac7553d0
PH
1436 }, {
1437 'note': 'embedded',
1438 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1439 'playlist_count': 4,
1440 'info_dict': {
1441 'title': 'JODA15',
acf757f4 1442 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 1443 }
6b08cdf6
PH
1444 }, {
1445 'note': 'Embedded SWF player',
1446 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1447 'playlist_count': 4,
1448 'info_dict': {
1449 'title': 'JODA7',
acf757f4 1450 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 1451 }
4b7df0d3
JMF
1452 }, {
1453 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1454 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1455 'info_dict': {
acf757f4
PH
1456 'title': 'Uploads from Interstellar Movie',
1457 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3
JMF
1458 },
1459 'playlist_mincout': 21,
81127aa5 1460 }]
c5e8d7af 1461
880e1c52
JMF
1462 def _real_initialize(self):
1463 self._login()
1464
def _extract_mix(self, playlist_id):
    """Return a playlist result for a YouTube mix.

    The mixes are generated from a single video; the id of the playlist
    is just 'RD' + video_id, so the watch page of the video named by the
    last 11 characters of *playlist_id* is downloaded and scanned for the
    mix title and the ids of the videos linked with that list id.
    """
    video_id = playlist_id[-11:]
    url = 'https://youtube.com/watch?v=%s&list=%s' % (video_id, playlist_id)
    webpage = self._download_webpage(
        url, playlist_id, 'Downloading Youtube mix')

    # Take the first class that yields a non-empty element; if none does,
    # the result of the last lookup is used as is
    title_span = None
    for css_class in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', css_class, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    id_pattern = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
    ids = orderedSet(re.findall(id_pattern, webpage))
    url_results = self._ids_to_results(ids)

    return self.playlist_result(url_results, playlist_id, title)
1484
def _extract_playlist(self, playlist_id):
    """Download a playlist page and return a lazily paginated playlist result.

    Raises ExtractorError (expected) when an alert on the page says that the
    playlist does not exist, is private, or that the parameters are invalid.
    Any other alert, except the language chooser, is reported as a warning.
    """
    page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

    for alert in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', alert):
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    def _entries():
        """Yield one url result per video, following the "Load more" links."""
        content_html = page
        more_widget_html = page
        for page_num in itertools.count(1):
            # Duplicates are dropped, and so is the link with index 0
            # (it's not the first video of the playlist)
            page_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0')
            for video_id in page_ids:
                yield self.url_result(video_id, 'Youtube', video_id=video_id)

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if load_more is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
        page, 'title')

    return self.playlist_result(_entries(), playlist_id, playlist_title)
c5e8d7af 1537
448830ce
S
def _real_extract(self, url):
    """Dispatch a playlist URL to single-video, mix or playlist extraction."""
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        raise ExtractorError('Invalid URL: %s' % url)
    # The playlist id is captured by either the first or the second group
    playlist_id = mobj.group(1) or mobj.group(2)

    # A 'v' query parameter means the URL also points at one specific video
    query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    if 'v' in query_dict:
        video_id = query_dict['v'][0]
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, 'Youtube', video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL')):
        return self._extract_mix(playlist_id)

    return self._extract_playlist(playlist_id)
1560
c5e8d7af
PH
1561
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Collect the (video id, title) pairs linked from a chunk of HTML.

        Every id is reported once, in order of first appearance. When the
        first link to a video carried no title, the title of a later link to
        the same video is used instead.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Extract all videos of a channel as a playlist."""
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = None
        # The download above is non-fatal, so (presumably) a falsy value comes
        # back when it fails; only search the page when there is one, otherwise
        # go straight to the page-by-page fallback below.
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix of the channel id for 'UU' to get the
            # playlist id that is handed over to the playlist extractor
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            """Yield the videos page by page, following the "Load more" links."""
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1654
1655
class YoutubeUserIE(YoutubeChannelIE):
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # The regex of this extractor is too permissive: it would also match
        # URLs that another extractor of this module can handle, so give
        # every other extractor the chance to claim the URL first.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1682
b05654f0 1683
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another until a page brings no
        new videos or at least n videos have been collected; the collected
        list is then cut down to n entries.

        Raises ExtractorError (expected) when a downloaded page carries a
        search message instead of results.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are available: fetching one more
            # page when exactly n videos were found would be a wasted request
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1727
c9ae7b95 1728
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent extractor, with one extra query argument
    # that asks for the results sorted by upload date
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1734
c9ae7b95
PH
1735
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Turn the first page of a search results URL into a playlist."""
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _entry_from_heading(part_code):
            # Each result heading holds the link to the item and its title;
            # a missing title is tolerated, a missing link is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }

        headings = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)

        return {
            '_type': 'playlist',
            'entries': [_entry_from_heading(heading) for heading in headings],
            'title': query,
        }
1777
1778
class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists of a show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1813
1814
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the videos of the feed, following its "Load more" links."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # A real list is required here: on Python 3 filter() returns a lazy
            # iterator that is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1862
1863
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # The tests of the parent playlist extractor do not apply here
    _TESTS = []

    def _real_extract(self, url):
        # Every accepted URL maps to the same fixed playlist id
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
f459d170 1873
5f6a1245 1874
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Look up the favourites playlist id and hand it to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The id is taken from the first "list=" value found on the page
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
15870e90
PH
1885
1886
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'recommended' feed
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 1892
1ed5b5c9 1893
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'subscriptions' feed.
    # The description names the shortcut with its leading colon, because that
    # is the only keyword form _VALID_URL accepts.
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 1899
1ed5b5c9 1900
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor configured for the 'history' feed
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' is not a valid escape sequence in a regular literal
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1906
1907
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A URL accepted by this extractor carries no video id, so the only
        # thing left to do is to explain the most likely cause to the user
        message = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(message, expected=True)
772fd5cc
PH
1951
1952
class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # Matches watch URLs that end in a video id of 1 to 10 characters
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be extracted from a shortened id; report it instead
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url)
        raise ExtractorError(message, expected=True)