]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
[youtube] Skip download for multiple v= test
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
7c80519c 22 compat_urllib_parse_urlparse,
c5e8d7af 23 compat_urllib_request,
7c61bd36 24 compat_urlparse,
c5e8d7af 25 compat_str,
4bb4a188
PH
26)
27from ..utils import (
c5e8d7af 28 clean_html,
c5e8d7af 29 ExtractorError,
2d30521a 30 float_or_none,
4bb4a188
PH
31 get_element_by_attribute,
32 get_element_by_id,
dd27fd17 33 int_or_none,
4bb4a188 34 orderedSet,
7c80519c 35 parse_duration,
cf7e015f 36 smuggle_url,
c93d53f5 37 str_to_int,
c5e8d7af
PH
38 unescapeHTML,
39 unified_strdate,
cf7e015f 40 unsmuggle_url,
81c2f20b 41 uppercase_escape,
af214c3a 42 ISO3166Utils,
c5e8d7af
PH
43)
44
5f6a1245 45
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com requesting hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one url_result (for the 'Youtube' extractor) per ID in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    @staticmethod
    def _encode_form(form_strs):
        """Return the urlencoded, ASCII-encoded request body for a form dict.

        Keys and values are converted to UTF-8 *before* urlencode because
        Python 2.x's urlencode chokes on unicode.
        """
        form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in form_strs.items())
        return compat_urllib_parse.urlencode(form).encode('ascii')

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, as the docstring promises
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }
        login_data = self._encode_form(login_form_strs)

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
            tfa_fields = {}
            for field in ('secTok', 'timeStmp'):
                match = re.search(
                    r'id="%s"\n\s+value=\'(.+)\'/>' % field,
                    login_results, re.M | re.U)
                if match is None:
                    self._downloader.report_warning(
                        'Failed to get %s - did the page structure change?' % field)
                    # Without this value the TFA form cannot be submitted;
                    # give up instead of dereferencing the failed match.
                    return False
                tfa_fields[field] = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': tfa_fields['secTok'],
                'timeStmp': tfa_fields['timeStmp'],
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_data = self._encode_form(tfa_form_strs)

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached. A failed login is not
        fatal here: _login() has already reported it as a warning.
        """
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 197
8377574c 198
360e1ca5 199class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 200 IE_DESC = 'YouTube.com'
cb7dfeea 201 _VALID_URL = r"""(?x)^
c5e8d7af 202 (
edb53e2d 203 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 204 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 205 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 206 (?:www\.)?pwnyoutube\.com/|
f7000f3a 207 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
208 tube\.majestyc\.net/|
209 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
210 (?:.*?\#/)? # handle anchor (#/) redirect urls
211 (?: # the various things that can precede the ID:
ac7553d0 212 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 213 |(?: # or the v= param in all its forms
f7000f3a 214 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 215 (?:\?|\#!?) # the params delimiter ? or # or #!
11b56058 216 (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
c5e8d7af
PH
217 v=
218 )
f4b05232
JMF
219 ))
220 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 221 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 222 )
c5e8d7af 223 )? # all until now is optional -> you can pass the naked ID
8963d9c2 224 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 225 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
226 (?(1).+)? # if we found the ID, everything can follow
227 $"""
c5e8d7af 228 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
229 _formats = {
230 '5': {'ext': 'flv', 'width': 400, 'height': 240},
231 '6': {'ext': 'flv', 'width': 450, 'height': 270},
232 '13': {'ext': '3gp'},
233 '17': {'ext': '3gp', 'width': 176, 'height': 144},
234 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
235 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
236 '34': {'ext': 'flv', 'width': 640, 'height': 360},
237 '35': {'ext': 'flv', 'width': 854, 'height': 480},
238 '36': {'ext': '3gp', 'width': 320, 'height': 240},
239 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
240 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
241 '43': {'ext': 'webm', 'width': 640, 'height': 360},
242 '44': {'ext': 'webm', 'width': 854, 'height': 480},
243 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
244 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
245 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
246 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 247
1d043b93 248
86fe61c8 249 # 3d videos
43b81eb9
PH
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 257
96fb5605 258 # Apple HTTP Live Streaming
43b81eb9
PH
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
266
267 # DASH mp4 video
43b81eb9
PH
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 273 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
276 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
277 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 279
f6f1fc92 280 # Dash mp4 audio
62cd676c
PH
281 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
282 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
283 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
284
285 # Dash webm
4c6bd5b5
JMF
286 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
287 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
288 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
289 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
290 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
291 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
292 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
e75cafe9
A
293 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 300 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 301 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
4c6bd5b5
JMF
302 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
303 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
304 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
305 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
306 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
2c62dc26
PH
307
308 # Dash webm audio
55db73ef 309 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 310 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 311
0857baad
PH
312 # Dash webm audio with opus inside
313 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
314 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
315 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
316
ce6b9a2d
PH
317 # RTMP (unnamed)
318 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 319 }
836a086c 320
78caa52a 321 IE_NAME = 'youtube'
2eb88d95
PH
322 _TESTS = [
323 {
297a564b 324 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
4bc3a23e
PH
325 'info_dict': {
326 'id': 'BaW_jenozKc',
327 'ext': 'mp4',
328 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
329 'uploader': 'Philipp Hagemeister',
330 'uploader_id': 'phihag',
331 'upload_date': '20121002',
332 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
333 'categories': ['Science & Technology'],
000b6b5a 334 'tags': ['youtube-dl'],
3e7c1224
PH
335 'like_count': int,
336 'dislike_count': int,
7c80519c 337 'start_time': 1,
297a564b 338 'end_time': 9,
2eb88d95 339 }
0e853ca4 340 },
0e853ca4 341 {
4bc3a23e
PH
342 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
343 'note': 'Test generic use_cipher_signature video (#897)',
344 'info_dict': {
345 'id': 'UxxajLWwzqY',
346 'ext': 'mp4',
347 'upload_date': '20120506',
348 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
000b6b5a
S
349 'description': 'md5:782e8651347686cba06e58f71ab51773',
350 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
351 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
352 'iconic ep', 'iconic', 'love', 'it'],
4bc3a23e
PH
353 'uploader': 'Icona Pop',
354 'uploader_id': 'IconaPop',
2eb88d95 355 }
c108eb73
JMF
356 },
357 {
4bc3a23e
PH
358 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
359 'note': 'Test VEVO video with age protection (#956)',
360 'info_dict': {
361 'id': '07FYdnEawAQ',
362 'ext': 'mp4',
363 'upload_date': '20130703',
364 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
365 'description': 'md5:64249768eec3bc4276236606ea996373',
366 'uploader': 'justintimberlakeVEVO',
367 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
368 }
369 },
fccd3771 370 {
4bc3a23e
PH
371 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
372 'note': 'Embed-only video (#1746)',
373 'info_dict': {
374 'id': 'yZIXLfi8CZQ',
375 'ext': 'mp4',
376 'upload_date': '20120608',
377 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
378 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
379 'uploader': 'SET India',
380 'uploader_id': 'setindia'
fccd3771
PH
381 }
382 },
11b56058
PM
383 {
384 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
385 'note': 'Use the first video ID in the URL',
386 'info_dict': {
387 'id': 'BaW_jenozKc',
388 'ext': 'mp4',
389 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
390 'uploader': 'Philipp Hagemeister',
391 'uploader_id': 'phihag',
392 'upload_date': '20121002',
393 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
394 'categories': ['Science & Technology'],
395 'tags': ['youtube-dl'],
396 'like_count': int,
397 'dislike_count': int,
34a7de29
S
398 },
399 'params': {
400 'skip_download': True,
401 },
11b56058 402 },
dd27fd17 403 {
4bc3a23e
PH
404 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
405 'note': '256k DASH audio (format 141) via DASH manifest',
406 'info_dict': {
407 'id': 'a9LDPn-MO4I',
408 'ext': 'm4a',
409 'upload_date': '20121002',
410 'uploader_id': '8KVIDEO',
411 'description': '',
412 'uploader': '8KVIDEO',
413 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 414 },
4bc3a23e
PH
415 'params': {
416 'youtube_include_dash_manifest': True,
417 'format': '141',
4919603f 418 },
dd27fd17 419 },
3489b7d2
JMF
420 # DASH manifest with encrypted signature
421 {
78caa52a
PH
422 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
423 'info_dict': {
424 'id': 'IB3lcPjvWLA',
425 'ext': 'm4a',
b766eb27
JMF
426 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
427 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
428 'uploader': 'AfrojackVEVO',
429 'uploader_id': 'AfrojackVEVO',
430 'upload_date': '20131011',
3489b7d2 431 },
4bc3a23e 432 'params': {
78caa52a
PH
433 'youtube_include_dash_manifest': True,
434 'format': '141',
3489b7d2
JMF
435 },
436 },
aaeb86f6
S
437 # JS player signature function name containing $
438 {
439 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
440 'info_dict': {
441 'id': 'nfWlot6h_JM',
442 'ext': 'm4a',
443 'title': 'Taylor Swift - Shake It Off',
444 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
445 'uploader': 'TaylorSwiftVEVO',
446 'uploader_id': 'TaylorSwiftVEVO',
447 'upload_date': '20140818',
448 },
449 'params': {
450 'youtube_include_dash_manifest': True,
451 'format': '141',
452 },
453 },
aa79ac0c
PH
454 # Controversy video
455 {
456 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
457 'info_dict': {
458 'id': 'T4XJQO3qol8',
459 'ext': 'mp4',
460 'upload_date': '20100909',
461 'uploader': 'The Amazing Atheist',
462 'uploader_id': 'TheAmazingAtheist',
463 'title': 'Burning Everyone\'s Koran',
464 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
465 }
c522adb1
JMF
466 },
467 # Normal age-gate video (No vevo, embed allowed)
468 {
469 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
470 'info_dict': {
471 'id': 'HtVdAasjOgU',
472 'ext': 'mp4',
473 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 474 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
475 'uploader': 'The Witcher',
476 'uploader_id': 'WitcherGame',
477 'upload_date': '20140605',
478 },
479 },
fccae2b9
S
480 # Age-gate video with encrypted signature
481 {
482 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
483 'info_dict': {
484 'id': '6kLq3WMV1nU',
485 'ext': 'mp4',
486 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
487 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
488 'uploader': 'LloydVEVO',
489 'uploader_id': 'LloydVEVO',
490 'upload_date': '20110629',
491 },
492 },
774e208f
PH
493 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
494 {
495 'url': '__2ABJjxzNo',
496 'info_dict': {
497 'id': '__2ABJjxzNo',
498 'ext': 'mp4',
499 'upload_date': '20100430',
500 'uploader_id': 'deadmau5',
501 'description': 'md5:12c56784b8032162bb936a5f76d55360',
502 'uploader': 'deadmau5',
503 'title': 'Deadmau5 - Some Chords (HD)',
504 },
505 'expected_warnings': [
506 'DASH manifest missing',
507 ]
e52a40ab
PH
508 },
509 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
510 {
511 'url': 'lqQg6PlCWgI',
512 'info_dict': {
513 'id': 'lqQg6PlCWgI',
514 'ext': 'mp4',
cbe2bd91
PH
515 'upload_date': '20120731',
516 'uploader_id': 'olympic',
517 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
518 'uploader': 'Olympics',
519 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
520 },
521 'params': {
522 'skip_download': 'requires avconv',
e52a40ab 523 }
cbe2bd91 524 },
6271f1ca
PH
525 # Non-square pixels
526 {
527 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
528 'info_dict': {
529 'id': '_b-2C3KPAM0',
530 'ext': 'mp4',
531 'stretched_ratio': 16 / 9.,
532 'upload_date': '20110310',
533 'uploader_id': 'AllenMeow',
534 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
535 'uploader': '孫艾倫',
536 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
537 },
06b491eb
S
538 },
539 # url_encoded_fmt_stream_map is empty string
540 {
541 'url': 'qEJwOuvDf7I',
542 'info_dict': {
543 'id': 'qEJwOuvDf7I',
544 'ext': 'mp4',
545 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
546 'description': '',
547 'upload_date': '20150404',
548 'uploader_id': 'spbelect',
549 'uploader': 'Наблюдатели Петербурга',
550 },
551 'params': {
552 'skip_download': 'requires avconv',
553 }
554 },
da77d856
S
555 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
556 {
557 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
558 'info_dict': {
559 'id': 'FIl7x6_3R5Y',
560 'ext': 'mp4',
561 'title': 'md5:7b81415841e02ecd4313668cde88737a',
562 'description': 'md5:116377fd2963b81ec4ce64b542173306',
563 'upload_date': '20150625',
564 'uploader_id': 'dorappi2000',
565 'uploader': 'dorappi2000',
566 'formats': 'mincount:33',
567 },
2ee8f5d8 568 },
8a1a26ce
YCH
569 # DASH manifest with segment_list
570 {
571 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
572 'md5': '8ce563a1d667b599d21064e982ab9e31',
573 'info_dict': {
574 'id': 'CsmdDsKjzN8',
575 'ext': 'mp4',
17ee98e1 576 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
577 'uploader': 'Airtek',
578 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
579 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
580 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
581 },
582 'params': {
583 'youtube_include_dash_manifest': True,
584 'format': '135', # bestvideo
585 }
2ee8f5d8 586 },
cf7e015f
S
587 {
588 # Multifeed videos (multiple cameras), URL is for Main Camera
589 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
590 'info_dict': {
591 'id': 'jqWvoWXjCVs',
592 'title': 'teamPGP: Rocket League Noob Stream',
593 'description': 'md5:dc7872fb300e143831327f1bae3af010',
594 },
595 'playlist': [{
596 'info_dict': {
597 'id': 'jqWvoWXjCVs',
598 'ext': 'mp4',
599 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
600 'description': 'md5:dc7872fb300e143831327f1bae3af010',
601 'upload_date': '20150721',
602 'uploader': 'Beer Games Beer',
603 'uploader_id': 'beergamesbeer',
604 },
605 }, {
606 'info_dict': {
607 'id': '6h8e8xoXJzg',
608 'ext': 'mp4',
609 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
610 'description': 'md5:dc7872fb300e143831327f1bae3af010',
611 'upload_date': '20150721',
612 'uploader': 'Beer Games Beer',
613 'uploader_id': 'beergamesbeer',
614 },
615 }, {
616 'info_dict': {
617 'id': 'PUOgX5z9xZw',
618 'ext': 'mp4',
619 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
620 'description': 'md5:dc7872fb300e143831327f1bae3af010',
621 'upload_date': '20150721',
622 'uploader': 'Beer Games Beer',
623 'uploader_id': 'beergamesbeer',
624 },
625 }, {
626 'info_dict': {
627 'id': 'teuwxikvS5k',
628 'ext': 'mp4',
629 'title': 'teamPGP: Rocket League Noob Stream (zim)',
630 'description': 'md5:dc7872fb300e143831327f1bae3af010',
631 'upload_date': '20150721',
632 'uploader': 'Beer Games Beer',
633 'uploader_id': 'beergamesbeer',
634 },
635 }],
636 'params': {
637 'skip_download': True,
638 },
639 }
2eb88d95
PH
640 ]
641
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the extractor with an empty signature-function cache.

    All positional and keyword arguments are forwarded unchanged to the
    parent class constructor.
    """
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by
    # (player_url, signature cache id)
    self._player_cache = {}
e0df6211 645
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self.to_screen('%s: Downloading video info webpage' % video_id)
c5e8d7af 649
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self.to_screen('%s: Extracting video information' % video_id)
c5e8d7af
PH
653
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
657
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self.to_screen('RTMP download detected')
c5e8d7af 661
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return a string describing the shape of a signature.

    The result is the length of every '.'-separated part of example_sig,
    joined with '.'; the characters of the signature are not used.
    """
    return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
665
666 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 667 id_m = re.match(
60620368 668 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 669 player_url)
c081b35c
PH
670 if not id_m:
671 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
672 player_type = id_m.group('ext')
673 player_id = id_m.group('id')
674
c4417ddb 675 # Read from filesystem cache
60064c53
PH
676 func_id = '%s_%s_%s' % (
677 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 678 assert os.path.basename(func_id) == func_id
a0e07d31 679
69ea8ca4 680 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 681 if cache_spec is not None:
78caa52a 682 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 683
6d1a55a5
PH
684 download_note = (
685 'Downloading player %s' % player_url
686 if self._downloader.params.get('verbose') else
687 'Downloading %s player %s' % (player_type, player_id)
688 )
e0df6211
PH
689 if player_type == 'js':
690 code = self._download_webpage(
691 player_url, video_id,
6d1a55a5 692 note=download_note,
69ea8ca4 693 errnote='Download of %s failed' % player_url)
83799698 694 res = self._parse_sig_js(code)
c4417ddb 695 elif player_type == 'swf':
e0df6211
PH
696 urlh = self._request_webpage(
697 player_url, video_id,
6d1a55a5 698 note=download_note,
69ea8ca4 699 errnote='Download of %s failed' % player_url)
e0df6211 700 code = urlh.read()
83799698 701 res = self._parse_sig_swf(code)
e0df6211
PH
702 else:
703 assert False, 'Invalid player type %r' % player_type
704
785521bf
PH
705 test_string = ''.join(map(compat_chr, range(len(example_sig))))
706 cache_res = res(test_string)
707 cache_spec = [ord(c) for c in cache_res]
83799698 708
69ea8ca4 709 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
710 return res
711
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is applied to a probe string whose i-th character has code
    point i; the resulting index list is rendered as a sum of index and
    slice expressions on `s`, guarded by a check on the lengths of the
    '.'-separated parts of example_sig. The text is sent to to_screen().
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            # Render the run start..end (inclusive) as a slice of `s`
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        # Short inputs never enter the pairwise loop below, which would
        # leave the loop variable `i` unbound.
        if not idxs:
            yield "''"
            return
        if len(idxs) == 1:
            yield 's[%d]' % idxs[0]
            return

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending or descending run starts at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            '    return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 750
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the entry function is located in *jscode* with a regex,
    extracted through JSInterpreter, and wrapped in a callable taking the
    signature string as its only argument.
    """
    entry_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    decipher = JSInterpreter(jscode).extract_function(entry_name)

    def _apply(s):
        # The extracted function receives its arguments as a list
        return decipher([s])

    return _apply
759
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the contents of a SWF player.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    through SWFInterpreter and wrapped in a callable taking the signature
    string as its only argument.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def _apply(s):
        # The extracted function receives its arguments as a list
        return decipher([s])

    return _apply
766
83799698 767 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 768 """Turn the encrypted s field into a working signature"""
6b37f0be 769
c8bf86d5 770 if player_url is None:
69ea8ca4 771 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 772
69ea8ca4 773 if player_url.startswith('//'):
78caa52a 774 player_url = 'https:' + player_url
c8bf86d5 775 try:
62af3a0e 776 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
777 if player_id not in self._player_cache:
778 func = self._extract_signature_function(
60064c53 779 video_id, player_url, s
c8bf86d5
PH
780 )
781 self._player_cache[player_id] = func
782 func = self._player_cache[player_id]
783 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 784 self._print_sig_code(func, s)
c8bf86d5
PH
785 return func(s)
786 except Exception as e:
787 tb = traceback.format_exc()
788 raise ExtractorError(
78caa52a 789 'Signature extraction failed: ' + tb, cause=e)
e0df6211 790
360e1ca5 791 def _get_subtitles(self, video_id, webpage):
de7f3446 792 try:
60e47a26 793 subs_doc = self._download_xml(
38c2e5b8 794 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
795 video_id, note=False)
796 except ExtractorError as err:
69ea8ca4 797 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446 798 return {}
de7f3446
JMF
799
800 sub_lang_list = {}
60e47a26
JMF
801 for track in subs_doc.findall('track'):
802 lang = track.attrib['lang_code']
7e660ac1
LD
803 if lang in sub_lang_list:
804 continue
360e1ca5
JMF
805 sub_formats = []
806 for ext in ['sbv', 'vtt', 'srt']:
807 params = compat_urllib_parse.urlencode({
808 'lang': lang,
809 'v': video_id,
810 'fmt': ext,
811 'name': track.attrib['name'].encode('utf-8'),
812 })
813 sub_formats.append({
814 'url': 'https://www.youtube.com/api/timedtext?' + params,
815 'ext': ext,
816 })
817 sub_lang_list[lang] = sub_formats
de7f3446 818 if not sub_lang_list:
69ea8ca4 819 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
820 return {}
821 return sub_lang_list
822
360e1ca5 823 def _get_automatic_captions(self, video_id, webpage):
de7f3446
JMF
824 """We need the webpage for getting the captions url, pass it as an
825 argument to speed up the process."""
69ea8ca4 826 self.to_screen('%s: Looking for automatic captions' % video_id)
de7f3446 827 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
78caa52a 828 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
829 if mobj is None:
830 self._downloader.report_warning(err_msg)
831 return {}
832 player_config = json.loads(mobj.group(1))
833 try:
0792d563
PH
834 args = player_config['args']
835 caption_url = args['ttsurl']
836 timestamp = args['timestamp']
055e6f36
JMF
837 # We get the available subtitles
838 list_params = compat_urllib_parse.urlencode({
839 'type': 'list',
840 'tlangs': 1,
841 'asrs': 1,
de7f3446 842 })
055e6f36 843 list_url = caption_url + '&' + list_params
e26f8712 844 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 845 original_lang_node = caption_list.find('track')
7d900ef1 846 if original_lang_node is None:
69ea8ca4 847 self._downloader.report_warning('Video doesn\'t have automatic captions')
e3dc22ca
JMF
848 return {}
849 original_lang = original_lang_node.attrib['lang_code']
7d900ef1 850 caption_kind = original_lang_node.attrib.get('kind', '')
055e6f36
JMF
851
852 sub_lang_list = {}
853 for lang_node in caption_list.findall('target'):
854 sub_lang = lang_node.attrib['lang_code']
360e1ca5
JMF
855 sub_formats = []
856 for ext in ['sbv', 'vtt', 'srt']:
857 params = compat_urllib_parse.urlencode({
858 'lang': original_lang,
859 'tlang': sub_lang,
860 'fmt': ext,
861 'ts': timestamp,
862 'kind': caption_kind,
863 })
864 sub_formats.append({
865 'url': caption_url + '&' + params,
866 'ext': ext,
867 })
868 sub_lang_list[sub_lang] = sub_formats
055e6f36 869 return sub_lang_list
de7f3446
JMF
870 # An extractor error can be raise by the download process if there are
871 # no automatic captions but there are subtitles
872 except (KeyError, ExtractorError):
873 self._downloader.report_warning(err_msg)
874 return {}
875
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of ``cls._VALID_URL`` matched on *url*.

    Raises ExtractorError when *url* does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
883
1d043b93
JMF
884 def _extract_from_m3u8(self, manifest_url, video_id):
885 url_map = {}
5f6a1245 886
1d043b93
JMF
887 def _get_urls(_manifest):
888 lines = _manifest.split('\n')
889 urls = filter(lambda l: l and not l.startswith('#'),
8bcc8756 890 lines)
1d043b93 891 return urls
78caa52a 892 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1d043b93
JMF
893 formats_urls = _get_urls(manifest)
894 for format_url in formats_urls:
890f62e8 895 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
896 url_map[itag] = format_url
897 return url_map
898
1fb07d10
JG
899 def _extract_annotations(self, video_id):
900 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
69ea8ca4 901 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1fb07d10 902
da276600 903 def _parse_dash_manifest(
77c6fb5b 904 self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
774e208f
PH
905 def decrypt_sig(mobj):
906 s = mobj.group(1)
907 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
908 return '/signature/%s' % dec_s
e1b9322b 909 dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
774e208f
PH
910 dash_doc = self._download_xml(
911 dash_manifest_url, video_id,
912 note='Downloading DASH manifest',
77c6fb5b
S
913 errnote='Could not download DASH manifest',
914 fatal=fatal)
915
916 if dash_doc is False:
917 return []
774e208f
PH
918
919 formats = []
de5c5456
YCH
920 for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
921 mime_type = a.attrib.get('mimeType')
922 for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
923 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
924 if url_el is None:
925 continue
926 if mime_type == 'text/vtt':
927 # TODO implement WebVTT downloading
928 pass
929 elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
6800d337 930 segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
de5c5456
YCH
931 format_id = r.attrib['id']
932 video_url = url_el.text
933 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
934 f = {
935 'format_id': format_id,
936 'url': video_url,
937 'width': int_or_none(r.attrib.get('width')),
938 'height': int_or_none(r.attrib.get('height')),
939 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
940 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
941 'filesize': filesize,
942 'fps': int_or_none(r.attrib.get('frameRate')),
943 }
0c8662d2 944 if segment_list is not None:
6800d337
YCH
945 f.update({
946 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
b9258c61 947 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
423d2be5 948 'protocol': 'http_dash_segments',
6800d337 949 })
de5c5456
YCH
950 try:
951 existing_format = next(
952 fo for fo in formats
953 if fo['format_id'] == format_id)
954 except StopIteration:
955 full_info = self._formats.get(format_id, {}).copy()
956 full_info.update(f)
1b5a1ae2
S
957 codecs = r.attrib.get('codecs')
958 if codecs:
959 if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
960 full_info['vcodec'] = codecs
961 elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
962 full_info['acodec'] = codecs
de5c5456
YCH
963 formats.append(full_info)
964 else:
965 existing_format.update(f)
966 else:
967 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
774e208f
PH
968 return formats
969
def _real_extract(self, url):
    """Extract the metadata and formats of a single YouTube video.

    Downloads the watch page (and, depending on the case, the embed page
    and get_video_info pages), then collects title, description, uploader,
    dates, counts, subtitles and the available formats, including those
    from DASH manifests unless 'youtube_include_dash_manifest' is disabled.

    Returns the info dict of the video, or a playlist result for multifeed
    videos when 'noplaylist' is not set.  Raises ExtractorError when the
    video info carries no token, for rental videos, when the uploader is
    missing, for rtmpe streams and when no format information is found.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end times may be given in the URL fragment or in the query;
    # the fragment is inspected first
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslash escapes of the embedded JSON string
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember each distinct DASH manifest URL once, in discovery order
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the value of their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
            for feed in multifeed_metadata_list.split(','):
                feed_data = compat_parse_qs(feed)
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    # force_singlefeed keeps the entry from expanding again
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalize the date text that was just found.  This must not use
            # mobj: at this point mobj is the uploader id match (or None).
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        # Read the number shown on the like/dislike button, if present
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn an {itag: url} mapping into format dicts, enriched with the
        # static data known for each itag
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                # Encrypted signature: the player is needed to decipher it
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                # Additional DASH manifests may end up in HTTP Error 403 therefore
                # allow them to fail without bug report message if we already have
                # some DASH manifest succeeded. This is temporary workaround to reduce
                # burst of bug reports until we figure out the reason and whether it
                # can be fixed at all.
                dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }
c5e8d7af 1386
5f6a1245 1387
880e1c52 1388class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
78caa52a 1389 IE_DESC = 'YouTube.com playlists'
d67cc9fa 1390 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1391 (?:https?://)?
1392 (?:\w+\.)?
1393 youtube\.com/
1394 (?:
ac7553d0 1395 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
c5e8d7af
PH
1396 \? (?:.*?&)*? (?:p|a|list)=
1397 | p/
1398 )
d67cc9fa 1399 (
99209c29 1400 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
5f6a1245 1401 # Top tracks, they can also include dots
d67cc9fa
JMF
1402 |(?:MC)[\w\.]*
1403 )
c5e8d7af
PH
1404 .*
1405 |
99209c29 1406 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
c5e8d7af 1407 )"""
dbb94fb0 1408 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dbb94fb0 1409 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
78caa52a 1410 IE_NAME = 'youtube:playlist'
81127aa5
PH
1411 _TESTS = [{
1412 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1413 'info_dict': {
1414 'title': 'ytdl test PL',
a1cf99d0 1415 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
81127aa5
PH
1416 },
1417 'playlist_count': 3,
9291475f
PH
1418 }, {
1419 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1420 'info_dict': {
acf757f4 1421 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
9291475f
PH
1422 'title': 'YDL_Empty_List',
1423 },
1424 'playlist_count': 0,
1425 }, {
1426 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1427 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1428 'info_dict': {
1429 'title': '29C3: Not my department',
acf757f4 1430 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
9291475f
PH
1431 },
1432 'playlist_count': 95,
1433 }, {
1434 'note': 'issue #673',
1435 'url': 'PLBB231211A4F62143',
1436 'info_dict': {
f46a8702 1437 'title': '[OLD]Team Fortress 2 (Class-based LP)',
acf757f4 1438 'id': 'PLBB231211A4F62143',
9291475f
PH
1439 },
1440 'playlist_mincount': 26,
1441 }, {
1442 'note': 'Large playlist',
1443 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1444 'info_dict': {
1445 'title': 'Uploads from Cauchemar',
acf757f4 1446 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
9291475f
PH
1447 },
1448 'playlist_mincount': 799,
1449 }, {
1450 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1451 'info_dict': {
1452 'title': 'YDL_safe_search',
acf757f4 1453 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
9291475f
PH
1454 },
1455 'playlist_count': 2,
ac7553d0
PH
1456 }, {
1457 'note': 'embedded',
1458 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1459 'playlist_count': 4,
1460 'info_dict': {
1461 'title': 'JODA15',
acf757f4 1462 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
ac7553d0 1463 }
6b08cdf6
PH
1464 }, {
1465 'note': 'Embedded SWF player',
1466 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1467 'playlist_count': 4,
1468 'info_dict': {
1469 'title': 'JODA7',
acf757f4 1470 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
6b08cdf6 1471 }
4b7df0d3
JMF
1472 }, {
1473 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1474 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1475 'info_dict': {
acf757f4
PH
1476 'title': 'Uploads from Interstellar Movie',
1477 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
4b7df0d3
JMF
1478 },
1479 'playlist_mincout': 21,
81127aa5 1480 }]
c5e8d7af 1481
880e1c52
JMF
1482 def _real_initialize(self):
1483 self._login()
1484
def _extract_mix(self, playlist_id):
    """Return the playlist result for a YouTube mix.

    The watch page of the mix is downloaded; its title is taken from the
    first non-empty element among a few candidate classes, and the video
    ids are collected, without duplicates, from the links carrying the
    mix's list id.
    """
    # The mixes are generated from a single video
    # the id of the playlist is just 'RD' + video_id
    seed_video_id = playlist_id[-11:]
    url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
    webpage = self._download_webpage(
        url, playlist_id, 'Downloading Youtube mix')

    # Take the first candidate class that yields a non-empty element
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    link_re = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
    ids = orderedSet(re.findall(link_re, webpage))
    url_results = self._ids_to_results(ids)

    return self.playlist_result(url_results, playlist_id, title)
1504
def _extract_playlist(self, playlist_id):
    """Return a playlist result for a regular (non-mix) playlist.

    The playlist page is downloaded, checked for alert messages and then
    walked page by page through its "load more" links.  The entries are
    produced lazily by a generator.

    Raises ExtractorError (expected) when the page reports that the
    playlist does not exist, is private, or the parameters are invalid.
    """
    page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

    for alert in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', alert):
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            # Harmless language chooser banner
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
        page, 'title')

    def _entries():
        """Yield one url result per video, following the "load more" links."""
        content_html = page
        more_widget_html = page
        for page_num in itertools.count(1):
            # Duplicates are dropped, and so is the link with index 0
            # (it's not the first video of the playlist)
            page_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0')
            for vid_id in page_ids:
                yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if load_more is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

    return self.playlist_result(_entries(), playlist_id, playlist_title)
c5e8d7af 1557
448830ce
S
def _real_extract(self, url):
    """Dispatch a playlist URL to the matching extraction routine.

    A URL that also carries a ``v`` parameter is resolved to that single
    video when the ``noplaylist`` option is set.  Playlist ids starting
    with ``RD`` or ``UL`` go through the mix extraction, everything else
    through the regular playlist extraction.

    Raises ExtractorError if the URL does not match ``_VALID_URL``.
    """
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        raise ExtractorError('Invalid URL: %s' % url)
    # The id is captured by either the first or the second group
    playlist_id = mobj.group(1) or mobj.group(2)

    # Check if it's a video-specific URL
    query = compat_urlparse.urlparse(url).query
    query_dict = compat_urlparse.parse_qs(query)
    if 'v' in query_dict:
        video_id = query_dict['v'][0]
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, 'Youtube', video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL')):
        return self._extract_mix(playlist_id)

    return self._extract_playlist(playlist_id)
1580
c5e8d7af
PH
1581
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a YouTube channel.

    When the channel's id can be read from the channel page, extraction is
    delegated to the channel's "uploads" playlist; otherwise the videos
    tab is scraped page by page.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Collect the videos linked from a chunk of channel HTML.

        Returns an iterable of ``(video_id, video_title)`` pairs in order
        of first appearance.  Each id is reported once; if the first link
        to a video carried no title, the title of a later link to the
        same video is used instead.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            # The title group is optional, so this may be None
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Return a playlist result (or a redirect to the uploads playlist)
        for the channel addressed by ``url``."""
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = None
        # The download above is non-fatal, so it may not have produced a
        # page (presumably a falsy, non-string value is returned then;
        # confirm against InfoExtractor._download_webpage).  Only run the
        # regex helpers on an actual page so that a failed download falls
        # through to the page-by-page extraction instead of crashing.
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' to get the playlist id
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            """Yield url results lazily, following the "load more" links."""
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1674
1675
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for the videos of a YouTube user (URL or ``ytuser:`` keyword)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if no other extractor in this module does."""
        # The regex of this extractor is too permissive: it would also
        # match URLs that another youtube extractor can handle, so every
        # other module-level name ending in 'IE' gets the first say.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1702
b05654f0 1703
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for YouTube searches addressed with the ``ytsearch`` key."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra parameters merged into every results-page query
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are fetched one after another until ``n`` videos have
        been collected or a page contributes no videos.  Returns a
        playlist result titled with the query and holding at most ``n``
        entries.

        Raises ExtractorError (expected) when a page carries a search
        message instead of results.
        """

        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are available; a strict '>'
            # here would download one more page when exactly n results
            # have already been collected.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1747
c9ae7b95 1748
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor that sorts by upload date."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Added to the search query to request ordering by upload date
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1754
c9ae7b95
PH
1755
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for YouTube search result page URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result page at ``url`` into a playlist titled with
        the decoded search query."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        # Everything of interest lives inside the "item-section" list
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result has its title and link inside an <h3> heading
        for heading in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1797
1798
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube shows; every season is a separate playlist."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the season playlists linked
        from the show page."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1833
1834
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Initialization hook: run the login step via ``self._login()``."""
        self._login()

    def _real_extract(self, url):
        """Return a playlist result with the videos of the feed.

        The feed page is downloaded and its "load more" links are followed
        until a portion brings no unseen video id or no further link is
        offered.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            #
            # This must be a real list: on Python 3 filter() returns an
            # iterator object, which is truthy even when it yields nothing,
            # so the emptiness check below would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1882
1883
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "watch later" list, handled as the playlist ``WL``."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    # Override the tests of the parent playlist extractor
    _TESTS = []

    def _real_extract(self, url):
        """Extract the fixed playlist id 'WL', whatever form ``url`` has."""
        return self._extract_playlist('WL')
f459d170 1893
5f6a1245 1894
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Read the playlist id off the favourites page and hand it over
        to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from the first 'list=' parameter found
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
15870e90
PH
1905
1906
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1ed5b5c9 1912
1ed5b5c9 1913
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1ed5b5c9 1919
1ed5b5c9 1920
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' is not a valid escape sequence in a plain string
    # literal; the resulting pattern is the same as before.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1926
1927
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that lack a video id and answers
    them with a hint about quoting the URL."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining the likely
        cause (an unquoted '&' in the shell)."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1971
1972
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose video id is only 1 to 10 characters long
    and reports them as truncated."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)