]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube:search_url] Fix extraction (Closes #6578)
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
7c80519c 22 compat_urllib_parse_urlparse,
c5e8d7af 23 compat_urllib_request,
7c61bd36 24 compat_urlparse,
c5e8d7af 25 compat_str,
4bb4a188
PH
26)
27from ..utils import (
c5e8d7af 28 clean_html,
c5e8d7af 29 ExtractorError,
2d30521a 30 float_or_none,
4bb4a188
PH
31 get_element_by_attribute,
32 get_element_by_id,
dd27fd17 33 int_or_none,
4bb4a188 34 orderedSet,
7c80519c 35 parse_duration,
041bc3ad 36 remove_start,
cf7e015f 37 smuggle_url,
c93d53f5 38 str_to_int,
c5e8d7af
PH
39 unescapeHTML,
40 unified_strdate,
cf7e015f 41 unsmuggle_url,
81c2f20b 42 uppercase_escape,
af214c3a 43 ISO3166Utils,
c5e8d7af
PH
44)
45
5f6a1245 46
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one url_result (ie 'Youtube') per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure as False (not None), as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                # The trailing space keeps the two concatenated literals from
                # running together as "<code>(Note"
                self._downloader.report_warning(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            tfa_code = remove_start(tfa_code, 'G-')

            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

            tfa_form_strs.update({
                'Pin': tfa_code,
                'TrustDevice': 'on',
            })

            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Still seeing the challenge form means the code was rejected
            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        # The result is deliberately ignored: a failed login has already been
        # reported as a warning and initialization ends here either way.
        self._login()
c5e8d7af 182
8377574c 183
360e1ca5 184class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
# Verbose-mode regex for the accepted inputs. Group 1 is the optional
# scheme/host/path prefix and group 2 the 11-character video ID. Since the
# prefix is optional, a bare ID is also accepted; the conditional (?(1).+)?
# only allows trailing text when a prefix was matched.
_VALID_URL = r"""(?x)^
                 (
                     (?:https?://|//)  # http(s):// or protocol-independent URL
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        (?:www\.)?deturl\.com/www\.youtube\.com/|
                        (?:www\.)?pwnyoutube\.com/|
                        (?:www\.)?yourepeat\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)  # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?  # handle anchor (#/) redirect urls
                     (?:  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/(?!videoseries))  # v/ or embed/ or e/
                         |(?:  # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)  # the params delimiter ? or # or #!
                             (?:.*?&)??  # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     ))
                     |youtu\.be/  # just youtu.be/xxxx
                     |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                     )
                 )?  # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]{11})  # here is it! the YouTube video ID
                 (?!.*?&list=)  # combined list/video URLs are handled by the playlist IE
                 (?(1).+)?  # if we found the ID, everything can follow
                 $"""
# Captures the value of a next_url query parameter (everything up to the next '&')
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
# Static metadata for known formats, keyed by format id string (presumably
# YouTube's itag - confirm against the code that reads this table).
# Each value may carry 'ext', 'width', 'height', 'fps', 'abr', 'acodec',
# 'vcodec', 'container', 'format_note', 'preference' or 'protocol'.
_formats = {
    '5': {'ext': 'flv', 'width': 400, 'height': 240},
    '6': {'ext': 'flv', 'width': 450, 'height': 270},
    '13': {'ext': '3gp'},
    '17': {'ext': '3gp', 'width': 176, 'height': 144},
    '18': {'ext': 'mp4', 'width': 640, 'height': 360},
    '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
    '34': {'ext': 'flv', 'width': 640, 'height': 360},
    '35': {'ext': 'flv', 'width': 854, 'height': 480},
    '36': {'ext': '3gp', 'width': 320, 'height': 240},
    '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
    '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
    '43': {'ext': 'webm', 'width': 640, 'height': 360},
    '44': {'ext': 'webm', 'width': 854, 'height': 480},
    '45': {'ext': 'webm', 'width': 1280, 'height': 720},
    '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
    '59': {'ext': 'mp4', 'width': 854, 'height': 480},
    '78': {'ext': 'mp4', 'width': 854, 'height': 480},


    # 3d videos
    '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
    '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
    '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
    '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
    '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
    '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
    '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

    # Apple HTTP Live Streaming
    '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
    '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
    '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
    '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
    '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
    '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
    '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

    # DASH mp4 video
    '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
    '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
    '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
    '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

    # Dash mp4 audio
    '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
    '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
    '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},

    # Dash webm
    '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
    '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
    '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
    '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
    '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
    '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
    '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
    '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},

    # Dash webm audio
    '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
    '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

    # Dash webm audio with opus inside
    '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
    '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
    '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

    # RTMP (unnamed)
    '_rtmp': {'protocol': 'rtmp'},
}
836a086c 305
IE_NAME = 'youtube'
# Test cases for this extractor (presumably consumed by the project's test
# harness - confirm). Each entry gives a 'url' (full URL or bare video ID)
# and the expected 'info_dict' fields; optional keys seen below are 'note',
# 'params', 'expected_warnings', 'md5' and 'playlist'.
_TESTS = [
    {
        'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
        'info_dict': {
            'id': 'BaW_jenozKc',
            'ext': 'mp4',
            'title': 'youtube-dl test video "\'/\\ä↭𝕐',
            'uploader': 'Philipp Hagemeister',
            'uploader_id': 'phihag',
            'upload_date': '20121002',
            'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
            'categories': ['Science & Technology'],
            'tags': ['youtube-dl'],
            'like_count': int,
            'dislike_count': int,
            'start_time': 1,
            'end_time': 9,
        }
    },
    {
        'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
        'note': 'Test generic use_cipher_signature video (#897)',
        'info_dict': {
            'id': 'UxxajLWwzqY',
            'ext': 'mp4',
            'upload_date': '20120506',
            'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
            'description': 'md5:782e8651347686cba06e58f71ab51773',
            'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
                     'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
                     'iconic ep', 'iconic', 'love', 'it'],
            'uploader': 'Icona Pop',
            'uploader_id': 'IconaPop',
        }
    },
    {
        'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
        'note': 'Test VEVO video with age protection (#956)',
        'info_dict': {
            'id': '07FYdnEawAQ',
            'ext': 'mp4',
            'upload_date': '20130703',
            'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
            'description': 'md5:64249768eec3bc4276236606ea996373',
            'uploader': 'justintimberlakeVEVO',
            'uploader_id': 'justintimberlakeVEVO',
            'age_limit': 18,
        }
    },
    {
        'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
        'note': 'Embed-only video (#1746)',
        'info_dict': {
            'id': 'yZIXLfi8CZQ',
            'ext': 'mp4',
            'upload_date': '20120608',
            'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
            'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
            'uploader': 'SET India',
            'uploader_id': 'setindia'
        }
    },
    {
        'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
        'note': 'Use the first video ID in the URL',
        'info_dict': {
            'id': 'BaW_jenozKc',
            'ext': 'mp4',
            'title': 'youtube-dl test video "\'/\\ä↭𝕐',
            'uploader': 'Philipp Hagemeister',
            'uploader_id': 'phihag',
            'upload_date': '20121002',
            'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
            'categories': ['Science & Technology'],
            'tags': ['youtube-dl'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    },
    {
        'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
        'note': '256k DASH audio (format 141) via DASH manifest',
        'info_dict': {
            'id': 'a9LDPn-MO4I',
            'ext': 'm4a',
            'upload_date': '20121002',
            'uploader_id': '8KVIDEO',
            'description': '',
            'uploader': '8KVIDEO',
            'title': 'UHDTV TEST 8K VIDEO.mp4'
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
    # DASH manifest with encrypted signature
    {
        'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
        'info_dict': {
            'id': 'IB3lcPjvWLA',
            'ext': 'm4a',
            'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
            'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
            'uploader': 'AfrojackVEVO',
            'uploader_id': 'AfrojackVEVO',
            'upload_date': '20131011',
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
    # JS player signature function name containing $
    {
        'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
        'info_dict': {
            'id': 'nfWlot6h_JM',
            'ext': 'm4a',
            'title': 'Taylor Swift - Shake It Off',
            'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
            'uploader': 'TaylorSwiftVEVO',
            'uploader_id': 'TaylorSwiftVEVO',
            'upload_date': '20140818',
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '141',
        },
    },
    # Controversy video
    {
        'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
        'info_dict': {
            'id': 'T4XJQO3qol8',
            'ext': 'mp4',
            'upload_date': '20100909',
            'uploader': 'The Amazing Atheist',
            'uploader_id': 'TheAmazingAtheist',
            'title': 'Burning Everyone\'s Koran',
            'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
        }
    },
    # Normal age-gate video (No vevo, embed allowed)
    {
        'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
        'info_dict': {
            'id': 'HtVdAasjOgU',
            'ext': 'mp4',
            'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
            'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
            'uploader': 'The Witcher',
            'uploader_id': 'WitcherGame',
            'upload_date': '20140605',
            'age_limit': 18,
        },
    },
    # Age-gate video with encrypted signature
    {
        'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
        'info_dict': {
            'id': '6kLq3WMV1nU',
            'ext': 'mp4',
            'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
            'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
            'uploader': 'LloydVEVO',
            'uploader_id': 'LloydVEVO',
            'upload_date': '20110629',
            'age_limit': 18,
        },
    },
    # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
    {
        'url': '__2ABJjxzNo',
        'info_dict': {
            'id': '__2ABJjxzNo',
            'ext': 'mp4',
            'upload_date': '20100430',
            'uploader_id': 'deadmau5',
            'description': 'md5:12c56784b8032162bb936a5f76d55360',
            'uploader': 'deadmau5',
            'title': 'Deadmau5 - Some Chords (HD)',
        },
        'expected_warnings': [
            'DASH manifest missing',
        ]
    },
    # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
    {
        'url': 'lqQg6PlCWgI',
        'info_dict': {
            'id': 'lqQg6PlCWgI',
            'ext': 'mp4',
            'upload_date': '20120724',
            'uploader_id': 'olympic',
            'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
            'uploader': 'Olympics',
            'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
        },
        'params': {
            'skip_download': 'requires avconv',
        }
    },
    # Non-square pixels
    {
        'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
        'info_dict': {
            'id': '_b-2C3KPAM0',
            'ext': 'mp4',
            'stretched_ratio': 16 / 9.,
            'upload_date': '20110310',
            'uploader_id': 'AllenMeow',
            'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
            'uploader': '孫艾倫',
            'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
        },
    },
    # url_encoded_fmt_stream_map is empty string
    {
        'url': 'qEJwOuvDf7I',
        'info_dict': {
            'id': 'qEJwOuvDf7I',
            'ext': 'webm',
            'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
            'description': '',
            'upload_date': '20150404',
            'uploader_id': 'spbelect',
            'uploader': 'Наблюдатели Петербурга',
        },
        'params': {
            'skip_download': 'requires avconv',
        }
    },
    # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
    {
        'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
        'info_dict': {
            'id': 'FIl7x6_3R5Y',
            'ext': 'mp4',
            'title': 'md5:7b81415841e02ecd4313668cde88737a',
            'description': 'md5:116377fd2963b81ec4ce64b542173306',
            'upload_date': '20150625',
            'uploader_id': 'dorappi2000',
            'uploader': 'dorappi2000',
            'formats': 'mincount:33',
        },
    },
    # DASH manifest with segment_list
    {
        'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
        'md5': '8ce563a1d667b599d21064e982ab9e31',
        'info_dict': {
            'id': 'CsmdDsKjzN8',
            'ext': 'mp4',
            'upload_date': '20150501',  # According to '<meta itemprop="datePublished"', but in other places it's 20150510
            'uploader': 'Airtek',
            'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
            'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
            'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
        },
        'params': {
            'youtube_include_dash_manifest': True,
            'format': '135',  # bestvideo
        }
    },
    {
        # Multifeed videos (multiple cameras), URL is for Main Camera
        'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
        'info_dict': {
            'id': 'jqWvoWXjCVs',
            'title': 'teamPGP: Rocket League Noob Stream',
            'description': 'md5:dc7872fb300e143831327f1bae3af010',
        },
        'playlist': [{
            'info_dict': {
                'id': 'jqWvoWXjCVs',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }, {
            'info_dict': {
                'id': '6h8e8xoXJzg',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }, {
            'info_dict': {
                'id': 'PUOgX5z9xZw',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }, {
            'info_dict': {
                'id': 'teuwxikvS5k',
                'ext': 'mp4',
                'title': 'teamPGP: Rocket League Noob Stream (zim)',
                'description': 'md5:dc7872fb300e143831327f1bae3af010',
                'upload_date': '20150721',
                'uploader': 'Beer Games Beer',
                'uploader_id': 'beergamesbeer',
            },
        }],
        'params': {
            'skip_download': True,
        },
    }
]
629
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent class and create an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature cache id)
    self._player_cache = dict()
e0df6211 633
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 637
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Announce on screen that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
641
def report_unavailable_format(self, video_id, format):
    """Announce on screen that *format* is not available for *video_id*."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
645
def report_rtmp_download(self):
    """Announce on screen that an RTMP download was detected."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 649
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated parts of *example_sig*.

    For instance 'abc.de' gives '3.2'.
    """
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
653
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that maps an encrypted signature to a deciphered one.

    The player id and type ('js' or 'swf') are parsed from *player_url*.
    A previously stored index list for this player and signature shape is
    loaded from the 'youtube-sigfuncs' cache when present. Otherwise the
    player is downloaded and parsed, and the resulting function is run on
    a probe string to record, and store, which input index lands at each
    output position.

    Raises ExtractorError if the player cannot be identified from the URL,
    if the cache key is not a plain file name, or if the player type is
    neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # The id is used as a cache file name, so it must not contain any path
    # component. An explicit raise (instead of assert) survives `python -O`.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec lists, per output position, the input index to copy
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # With a bare `assert False` this branch vanished under -O and the
        # code below failed on an unbound `res`.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Probe string: the character at position k has code point k, so the
    # code points of the output are the source indices.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
699
def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    *func* is applied to a probe string as long as *example_sig* whose
    k-th character has code point k. The code points of the result are the
    input indices in output order. Runs of indices that rise or fall by
    one are printed as slices, everything else as single subscripts.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # A stop index below zero cannot be written literally; an
            # open-ended slice covers "down to and including index 0".
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        # With fewer than two indices the pairwise loop below never runs
        # and its loop variable would be unbound afterwards.
        if len(idxs) < 2:
            for idx in idxs:
                yield 's[%d]' % idx
            return

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # The run ended at prev; emit it and start over at i
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 738
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in *jscode*.

    The function name is the identifier captured after '.sig||' in the
    player source; the function is extracted with JSInterpreter and called
    with the signature as its only argument.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    decipher = JSInterpreter(jscode).extract_function(sig_func_name)

    def apply_js(s):
        return decipher([s])

    return apply_js
747
def _parse_sig_swf(self, file_contents):
    """Return a callable that runs the SWF player's signature function.

    The ``decipher`` function of the ``SignatureDecipher`` class is
    extracted from *file_contents* with SWFInterpreter.  The returned
    callable takes the signature string as its only argument.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    swf_sig_func = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function expects its arguments as a list
        return swf_sig_func([s])

    return decipher
754
83799698 755 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 756 """Turn the encrypted s field into a working signature"""
6b37f0be 757
c8bf86d5 758 if player_url is None:
69ea8ca4 759 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 760
69ea8ca4 761 if player_url.startswith('//'):
78caa52a 762 player_url = 'https:' + player_url
c8bf86d5 763 try:
62af3a0e 764 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
765 if player_id not in self._player_cache:
766 func = self._extract_signature_function(
60064c53 767 video_id, player_url, s
c8bf86d5
PH
768 )
769 self._player_cache[player_id] = func
770 func = self._player_cache[player_id]
771 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 772 self._print_sig_code(func, s)
c8bf86d5
PH
773 return func(s)
774 except Exception as e:
775 tb = traceback.format_exc()
776 raise ExtractorError(
78caa52a 777 'Signature extraction failed: ' + tb, cause=e)
e0df6211 778
def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is downloaded from the timedtext listing URL; for each
    language (first track wins) one entry per extension in sbv/vtt/srt is
    built.  An empty dict is returned, after a warning, when the list
    cannot be downloaded or contains no tracks.
    """
    try:
        track_list = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in track_list.findall('track'):
        lang = track.attrib['lang_code']
        # Only the first track of each language is kept
        if lang in subtitles:
            continue
        lang_formats = []
        for ext in ['sbv', 'vtt', 'srt']:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            })
            lang_formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + query,
                'ext': ext,
            })
        subtitles[lang] = lang_formats

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
810
def _get_automatic_captions(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    Returns a dict mapping each target language code to a list of caption
    formats (sbv/vtt/srt), or an empty dict after a warning when the
    captions cannot be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_query = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(caption_url + '&' + list_query, video_id)
        source_track = caption_list.find('track')
        if source_track is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = source_track.attrib['lang_code']
        caption_kind = source_track.attrib.get('kind', '')

        captions = {}
        for target in caption_list.findall('target'):
            target_lang = target.attrib['lang_code']
            target_formats = []
            for ext in ['sbv', 'vtt', 'srt']:
                query = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': target_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                target_formats.append({
                    'url': caption_url + '&' + query,
                    'ext': ext,
                })
            captions[target_lang] = target_formats
        return captions
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
863
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the second capture group of ``cls._VALID_URL`` matched on *url*.

    ``_VALID_URL`` is matched in verbose mode.  Raises ExtractorError when
    *url* does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
871
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag to URL.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is taken from the ``itag/<digits>/`` part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and lines starting with '#'
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
886
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for *video_id*."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 890
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
    """Download a DASH manifest and return the list of formats it describes.

    Any ``/s/<encrypted signature>`` component of *dash_manifest_url* is
    replaced by ``/signature/<decrypted signature>`` before downloading.
    *fatal* is passed on to ``_download_xml``; when the download yields
    ``False`` an empty list is returned.

    Each audio/video Representation with a BaseURL becomes one format
    dict, merged over the static ``self._formats`` entry for its id.
    """
    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest',
        fatal=fatal)

    if dash_doc is False:
        return []

    formats = []
    for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
        mime_type = a.attrib.get('mimeType')
        for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                pass
            # mime_type is None when the AdaptationSet has no mimeType
            # attribute; such sets fall through to the warning below
            # instead of failing on None.startswith
            elif mime_type is not None and mime_type.startswith(('audio/', 'video/')):
                segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'height': int_or_none(r.attrib.get('height')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                if segment_list is not None:
                    f.update({
                        'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
                        'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
                        'protocol': 'http_dash_segments',
                    })
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    # First time this format id is seen: start from the
                    # static description and overlay the manifest data
                    full_info = self._formats.get(format_id, {}).copy()
                    full_info.update(f)
                    codecs = r.attrib.get('codecs')
                    if codecs:
                        # Fill in only the codec the static entry leaves open
                        if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                            full_info['vcodec'] = codecs
                        elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                            full_info['acodec'] = codecs
                    formats.append(full_info)
                else:
                    existing_format.update(f)
            else:
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
957
def _real_extract(self, url):
    """Extract the metadata and formats of a single video.

    Downloads the watch page (and the embed / get_video_info pages when
    needed), collects title, description, uploader, dates, counts,
    subtitles and the list of formats, and returns them as an info dict.
    For multifeed videos a playlist result of the individual feeds is
    returned instead, unless 'noplaylist' is set or the URL was smuggled
    with 'force_singlefeed'.

    Raises ExtractorError when the video info carries no token, for
    rental videos, when the uploader name is missing, for rtmpe streams
    and when no format information is found.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end times can be given in the URL fragment or in the query;
    # the fragment is checked first
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember every distinct DASH manifest URL, in discovery order
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                # Stop at the first el type whose response carries a token
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
            for feed in multifeed_metadata_list.split(','):
                feed_data = compat_parse_qs(feed)
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise separators and whitespace of the date text that was
            # just matched (not of the unrelated uploader-id match in mobj)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn an {itag: url} map into format dicts, enriched with the
        # static per-itag data from self._formats when available
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                # Plain signature: append as is
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # Encrypted signature: needs the player to decrypt it
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                # Additional DASH manifests may end up in HTTP Error 403 therefore
                # allow them to fail without bug report message if we already have
                # some DASH manifest succeeded. This is temporary workaround to reduce
                # burst of bug reports until we figure out the reason and whether it
                # can be fixed at all.
                dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }
c5e8d7af 1374
5f6a1245 1375
880e1c52 1376class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
78caa52a 1377 IE_DESC = 'YouTube.com playlists'
d67cc9fa 1378 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1379 (?:https?://)?
1380 (?:\w+\.)?
1381 youtube\.com/
1382 (?:
ac7553d0 1383 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
c5e8d7af
PH
1384 \? (?:.*?&)*? (?:p|a|list)=
1385 | p/
1386 )
d67cc9fa 1387 (
99209c29 1388 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
5f6a1245 1389 # Top tracks, they can also include dots
d67cc9fa
JMF
1390 |(?:MC)[\w\.]*
1391 )
c5e8d7af
PH
1392 .*
1393 |
99209c29 1394 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
c5e8d7af 1395 )"""
dbb94fb0 1396 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dbb94fb0 1397 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
78caa52a 1398 IE_NAME = 'youtube:playlist'
81127aa5
PH
1399 _TESTS = [{
1400 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1401 'info_dict': {
1402 'title': 'ytdl test PL',
a1cf99d0 1403 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
1404 },
1405 'playlist_count': 3,
9291475f
PH
1406 }, {
1407 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1408 'info_dict': {
acf757f4 1409 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
1410 'title': 'YDL_Empty_List',
1411 },
1412 'playlist_count': 0,
1413 }, {
1414 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1415 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1416 'info_dict': {
1417 'title': '29C3: Not my department',
acf757f4 1418 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
1419 },
1420 'playlist_count': 95,
1421 }, {
1422 'note': 'issue #673',
1423 'url': 'PLBB231211A4F62143',
1424 'info_dict': {
f46a8702 1425 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 1426 'id': 'PLBB231211A4F62143',
9291475f
PH
1427 },
1428 'playlist_mincount': 26,
1429 }, {
1430 'note': 'Large playlist',
1431 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1432 'info_dict': {
1433 'title': 'Uploads from Cauchemar',
acf757f4 1434 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
1435 },
1436 'playlist_mincount': 799,
1437 }, {
1438 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1439 'info_dict': {
1440 'title': 'YDL_safe_search',
acf757f4 1441 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
1442 },
1443 'playlist_count': 2,
ac7553d0
PH
1444 }, {
1445 'note': 'embedded',
1446 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1447 'playlist_count': 4,
1448 'info_dict': {
1449 'title': 'JODA15',
acf757f4 1450 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 1451 }
6b08cdf6
PH
1452 }, {
1453 'note': 'Embedded SWF player',
1454 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1455 'playlist_count': 4,
1456 'info_dict': {
1457 'title': 'JODA7',
acf757f4 1458 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 1459 }
4b7df0d3
JMF
1460 }, {
1461 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1462 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1463 'info_dict': {
acf757f4
PH
1464 'title': 'Uploads from Interstellar Movie',
1465 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3
JMF
1466 },
1467 'playlist_mincout': 21,
81127aa5 1468 }]
c5e8d7af 1469
880e1c52
JMF
def _real_initialize(self):
    # Log in before any playlist extraction takes place
    self._login()
1472
def _extract_mix(self, playlist_id):
    """Return a playlist result for the mix identified by *playlist_id*.

    The watch page of the mix is downloaded, the title is taken from the
    first matching title element and the video ids are collected from the
    links that point back into this playlist.
    """
    # The mixes are generated from a single video
    # the id of the playlist is just 'RD' + video_id
    mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
    webpage = self._download_webpage(
        mix_url, playlist_id, 'Downloading Youtube mix')

    def element_by_class(class_name):
        return get_element_by_attribute('class', class_name, webpage)

    # Try the candidate title elements in order of preference
    title_html = (
        element_by_class('playlist-title') or
        element_by_class('title long-title') or
        element_by_class('title'))
    title = clean_html(title_html)

    video_ids = orderedSet(re.findall(
        r'''(?xs)data-video-username=".*?".*?
                   href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
        webpage))

    return self.playlist_result(
        self._ids_to_results(video_ids), playlist_id, title)
1492
def _extract_playlist(self, playlist_id):
    """Extract a regular playlist, following its "load more" pagination.

    Alert messages found on the first page are inspected first: a missing
    or private playlist and invalid parameters raise ExtractorError, the
    language chooser is ignored, and any other alert is reported as a
    warning.  The entries themselves are produced lazily by a generator.
    """
    page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

    alerts = re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page)
    for alert in alerts:
        alert = alert.strip()
        # Check if the playlist exists or is private
        if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', alert):
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
        page, 'title')

    def _entries():
        # Extract the video ids from the playlist pages
        html_chunk = widget_html = page
        page_num = 1
        while True:
            # Duplicates are removed, and so is the link with index 0
            # (it's not the first video of the playlist)
            page_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, html_chunk)
                if m.group('index') != '0')
            for video_id in page_ids:
                yield self.url_result(video_id, 'Youtube', video_id=video_id)

            next_page = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if next_page is None:
                return

            more = self._download_json(
                'https://youtube.com/%s' % next_page.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            html_chunk = more['content_html']
            if not html_chunk.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']
            page_num += 1

    return self.playlist_result(_entries(), playlist_id, playlist_title)
c5e8d7af 1545
448830ce
S
def _real_extract(self, url):
    """Route a playlist URL to the matching extraction path.

    When the URL also carries a ``v`` query parameter and the
    ``noplaylist`` option is set, only that video is returned.  Playlist
    ids starting with ``RD`` or ``UL`` go through ``_extract_mix``;
    everything else goes through ``_extract_playlist``.
    """
    # Extract playlist id
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = mobj.group(1) or mobj.group(2)

    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    if 'v' in query:
        video_id = query['v'][0]
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, 'Youtube', video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL')):
        return self._extract_mix(playlist_id)
    return self._extract_playlist(playlist_id)
1568
c5e8d7af
PH
1569
class YoutubeChannelIE(InfoExtractor):
    """Extractor for channel URLs.

    Preferably hands the channel over to the playlist extractor (via the
    channel's "UU..." uploads playlist); otherwise falls back to walking
    the channel's video pages.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return ``(video_id, title)`` pairs for the watch links in ``page``.

        Ids keep their first-seen order.  A link may come without a title
        (the title is then ``None``); if a later link to the same video has
        one, it fills in the missing title.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Return a playlist for the channel addressed by ``url``."""
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = None
        # The download above is non-fatal, so it may hand back a falsy
        # value instead of the page (assumed behaviour of fatal=False --
        # confirm against _download_webpage).  Feeding that into the regex
        # helpers would raise instead of falling back, so only search a
        # page that was actually obtained.
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Walk the "load more" chain, yielding the videos of each chunk
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1662
1663
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for user video listings; reuses the channel extraction."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim ``url`` only if no other ``*IE`` name in this module does."""
        # The regex of this extractor is too permissive and would match
        # URLs that belong to the other youtube extractors, so those are
        # asked first.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1690
b05654f0 1691
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for ``ytsearch`` queries, paging through the results."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another until a page brings
        no videos or at least ``n`` videos have been collected; the list is
        then cut down to ``n`` entries.  Raises ExtractorError when a page
        contains a search message instead of results.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough videos are available: ">=" avoids
            # downloading one more page whose results would be discarded
            # when exactly n videos have been collected.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1735
c9ae7b95 1736
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant that adds a date sort argument to every query."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1742
c9ae7b95
PH
1743
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for search result page URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist made of the result links found on the page.

        The playlist title is the decoded ``search_query`` value of the URL.
        """
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Every result is described by the heading block holding its link
        for part_code in re.findall(
                r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code):
            # The title is optional, the link itself is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1785
1786
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; every season link becomes a playlist entry."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists."""
        show_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, show_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (show_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': title,
            'entries': entries,
        }
04cc9617
JMF
1821
1822
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed and return them as a playlist.

        The feed page is followed through its "load more" links until no
        further link is offered or a portion brings no unseen video.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This has to be a real list: on Python 3 filter() returns a lazy
            # iterator that is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1870
1871
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Playlist extractor pinned to the playlist with id 'WL'."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Ignore the details of ``url`` and extract the 'WL' playlist."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
f459d170 1881
5f6a1245 1882
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to its playlist and delegate to it."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Look up the playlist id on the favourites page and hand it on."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1893
1894
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1ed5b5c9 1900
1ed5b5c9 1901
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1ed5b5c9 1907
1ed5b5c9 1908
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the sibling feed extractors: "\." is not a valid
    # escape sequence in a plain string literal.  The pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1914
1915
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs lacking a video id and explains why.

    Extraction always fails with a hint about quoting the URL.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an (expected) ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1959
1960
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose ``v`` value has only 1 to 10 characters.

    Extraction always fails, reporting the incomplete id.
    """
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_id'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an (expected) ExtractorError naming the short id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)