]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
Merge pull request #6533 from sceext2/fix-iqiyi-2015-08-10
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
2b25cb5d 14from ..jsinterp import JSInterpreter
54256267 15from ..swfinterp import SWFInterpreter
4bb4a188 16from ..compat import (
edf3e38e 17 compat_chr,
c5e8d7af 18 compat_parse_qs,
c5e8d7af 19 compat_urllib_parse,
7fd002c0
S
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
7c80519c 22 compat_urllib_parse_urlparse,
c5e8d7af 23 compat_urllib_request,
7c61bd36 24 compat_urlparse,
c5e8d7af 25 compat_str,
4bb4a188
PH
26)
27from ..utils import (
c5e8d7af 28 clean_html,
c5e8d7af 29 ExtractorError,
2d30521a 30 float_or_none,
4bb4a188
PH
31 get_element_by_attribute,
32 get_element_by_id,
dd27fd17 33 int_or_none,
4bb4a188 34 orderedSet,
7c80519c 35 parse_duration,
cf7e015f 36 smuggle_url,
c93d53f5 37 str_to_int,
c5e8d7af
PH
38 unescapeHTML,
39 unified_strdate,
cf7e015f 40 unsmuggle_url,
81c2f20b 41 uppercase_escape,
af214c3a 42 ISO3166Utils,
c5e8d7af
PH
43)
44
5f6a1245 45
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie on ``.youtube.com`` with ``hl=en``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one ``url_result`` entry (handled by 'Youtube') per video id."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Honour the documented contract: a failed login returns False,
            # not an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without secTok the TFA form cannot be submitted; bail out
                # instead of crashing on match.group() below.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # Same as above: timeStmp is mandatory for the TFA form.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being served again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in.

        Does nothing when no downloader is attached.
        """
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
c5e8d7af 197
8377574c 198
360e1ca5 199class YoutubeIE(YoutubeBaseInfoExtractor):
78caa52a 200 IE_DESC = 'YouTube.com'
cb7dfeea 201 _VALID_URL = r"""(?x)^
c5e8d7af 202 (
edb53e2d 203 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 204 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 205 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 206 (?:www\.)?pwnyoutube\.com/|
f7000f3a 207 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
208 tube\.majestyc\.net/|
209 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
210 (?:.*?\#/)? # handle anchor (#/) redirect urls
211 (?: # the various things that can precede the ID:
ac7553d0 212 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 213 |(?: # or the v= param in all its forms
f7000f3a 214 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af 215 (?:\?|\#!?) # the params delimiter ? or # or #!
11b56058 216 (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
c5e8d7af
PH
217 v=
218 )
f4b05232
JMF
219 ))
220 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 221 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 222 )
c5e8d7af 223 )? # all until now is optional -> you can pass the naked ID
8963d9c2 224 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 225 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
226 (?(1).+)? # if we found the ID, everything can follow
227 $"""
c5e8d7af 228 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
229 _formats = {
230 '5': {'ext': 'flv', 'width': 400, 'height': 240},
231 '6': {'ext': 'flv', 'width': 450, 'height': 270},
232 '13': {'ext': '3gp'},
233 '17': {'ext': '3gp', 'width': 176, 'height': 144},
234 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
235 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
236 '34': {'ext': 'flv', 'width': 640, 'height': 360},
237 '35': {'ext': 'flv', 'width': 854, 'height': 480},
238 '36': {'ext': '3gp', 'width': 320, 'height': 240},
239 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
240 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
241 '43': {'ext': 'webm', 'width': 640, 'height': 360},
242 '44': {'ext': 'webm', 'width': 854, 'height': 480},
243 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
244 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
c9bebed2
S
245 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
246 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
2c62dc26 247
1d043b93 248
86fe61c8 249 # 3d videos
43b81eb9
PH
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 257
96fb5605 258 # Apple HTTP Live Streaming
43b81eb9
PH
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
266
267 # DASH mp4 video
43b81eb9
PH
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 273 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
276 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
277 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 279
f6f1fc92 280 # Dash mp4 audio
62cd676c
PH
281 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
282 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
283 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
284
285 # Dash webm
4c6bd5b5
JMF
286 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
287 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
288 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
289 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
290 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
291 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
292 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
e75cafe9
A
293 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 300 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 301 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
4c6bd5b5
JMF
302 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
303 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
304 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
305 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
306 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
2c62dc26
PH
307
308 # Dash webm audio
55db73ef 309 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 310 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 311
0857baad
PH
312 # Dash webm audio with opus inside
313 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
314 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
315 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
316
ce6b9a2d
PH
317 # RTMP (unnamed)
318 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 319 }
836a086c 320
78caa52a 321 IE_NAME = 'youtube'
2eb88d95
PH
322 _TESTS = [
323 {
297a564b 324 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
4bc3a23e
PH
325 'info_dict': {
326 'id': 'BaW_jenozKc',
327 'ext': 'mp4',
328 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
329 'uploader': 'Philipp Hagemeister',
330 'uploader_id': 'phihag',
331 'upload_date': '20121002',
332 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
333 'categories': ['Science & Technology'],
000b6b5a 334 'tags': ['youtube-dl'],
3e7c1224
PH
335 'like_count': int,
336 'dislike_count': int,
7c80519c 337 'start_time': 1,
297a564b 338 'end_time': 9,
2eb88d95 339 }
0e853ca4 340 },
0e853ca4 341 {
4bc3a23e
PH
342 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
343 'note': 'Test generic use_cipher_signature video (#897)',
344 'info_dict': {
345 'id': 'UxxajLWwzqY',
346 'ext': 'mp4',
347 'upload_date': '20120506',
348 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
000b6b5a
S
349 'description': 'md5:782e8651347686cba06e58f71ab51773',
350 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
351 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
352 'iconic ep', 'iconic', 'love', 'it'],
4bc3a23e
PH
353 'uploader': 'Icona Pop',
354 'uploader_id': 'IconaPop',
2eb88d95 355 }
c108eb73
JMF
356 },
357 {
4bc3a23e
PH
358 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
359 'note': 'Test VEVO video with age protection (#956)',
360 'info_dict': {
361 'id': '07FYdnEawAQ',
362 'ext': 'mp4',
363 'upload_date': '20130703',
364 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
365 'description': 'md5:64249768eec3bc4276236606ea996373',
366 'uploader': 'justintimberlakeVEVO',
367 'uploader_id': 'justintimberlakeVEVO',
34952f09 368 'age_limit': 18,
c108eb73
JMF
369 }
370 },
fccd3771 371 {
4bc3a23e
PH
372 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
373 'note': 'Embed-only video (#1746)',
374 'info_dict': {
375 'id': 'yZIXLfi8CZQ',
376 'ext': 'mp4',
377 'upload_date': '20120608',
378 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
379 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
380 'uploader': 'SET India',
381 'uploader_id': 'setindia'
fccd3771
PH
382 }
383 },
11b56058
PM
384 {
385 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
386 'note': 'Use the first video ID in the URL',
387 'info_dict': {
388 'id': 'BaW_jenozKc',
389 'ext': 'mp4',
390 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
391 'uploader': 'Philipp Hagemeister',
392 'uploader_id': 'phihag',
393 'upload_date': '20121002',
394 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
395 'categories': ['Science & Technology'],
396 'tags': ['youtube-dl'],
397 'like_count': int,
398 'dislike_count': int,
34a7de29
S
399 },
400 'params': {
401 'skip_download': True,
402 },
11b56058 403 },
dd27fd17 404 {
4bc3a23e
PH
405 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
406 'note': '256k DASH audio (format 141) via DASH manifest',
407 'info_dict': {
408 'id': 'a9LDPn-MO4I',
409 'ext': 'm4a',
410 'upload_date': '20121002',
411 'uploader_id': '8KVIDEO',
412 'description': '',
413 'uploader': '8KVIDEO',
414 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 415 },
4bc3a23e
PH
416 'params': {
417 'youtube_include_dash_manifest': True,
418 'format': '141',
4919603f 419 },
dd27fd17 420 },
3489b7d2
JMF
421 # DASH manifest with encrypted signature
422 {
78caa52a
PH
423 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
424 'info_dict': {
425 'id': 'IB3lcPjvWLA',
426 'ext': 'm4a',
b766eb27
JMF
427 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
428 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
429 'uploader': 'AfrojackVEVO',
430 'uploader_id': 'AfrojackVEVO',
431 'upload_date': '20131011',
3489b7d2 432 },
4bc3a23e 433 'params': {
78caa52a
PH
434 'youtube_include_dash_manifest': True,
435 'format': '141',
3489b7d2
JMF
436 },
437 },
aaeb86f6
S
438 # JS player signature function name containing $
439 {
440 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
441 'info_dict': {
442 'id': 'nfWlot6h_JM',
443 'ext': 'm4a',
444 'title': 'Taylor Swift - Shake It Off',
445 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
446 'uploader': 'TaylorSwiftVEVO',
447 'uploader_id': 'TaylorSwiftVEVO',
448 'upload_date': '20140818',
449 },
450 'params': {
451 'youtube_include_dash_manifest': True,
452 'format': '141',
453 },
454 },
aa79ac0c
PH
455 # Controversy video
456 {
457 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
458 'info_dict': {
459 'id': 'T4XJQO3qol8',
460 'ext': 'mp4',
461 'upload_date': '20100909',
462 'uploader': 'The Amazing Atheist',
463 'uploader_id': 'TheAmazingAtheist',
464 'title': 'Burning Everyone\'s Koran',
465 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
466 }
c522adb1
JMF
467 },
468 # Normal age-gate video (No vevo, embed allowed)
469 {
470 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
471 'info_dict': {
472 'id': 'HtVdAasjOgU',
473 'ext': 'mp4',
474 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 475 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
476 'uploader': 'The Witcher',
477 'uploader_id': 'WitcherGame',
478 'upload_date': '20140605',
34952f09 479 'age_limit': 18,
c522adb1
JMF
480 },
481 },
fccae2b9
S
482 # Age-gate video with encrypted signature
483 {
484 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
485 'info_dict': {
486 'id': '6kLq3WMV1nU',
487 'ext': 'mp4',
488 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
489 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
490 'uploader': 'LloydVEVO',
491 'uploader_id': 'LloydVEVO',
492 'upload_date': '20110629',
34952f09 493 'age_limit': 18,
fccae2b9
S
494 },
495 },
774e208f
PH
496 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
497 {
498 'url': '__2ABJjxzNo',
499 'info_dict': {
500 'id': '__2ABJjxzNo',
501 'ext': 'mp4',
502 'upload_date': '20100430',
503 'uploader_id': 'deadmau5',
504 'description': 'md5:12c56784b8032162bb936a5f76d55360',
505 'uploader': 'deadmau5',
506 'title': 'Deadmau5 - Some Chords (HD)',
507 },
508 'expected_warnings': [
509 'DASH manifest missing',
510 ]
e52a40ab
PH
511 },
512 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
513 {
514 'url': 'lqQg6PlCWgI',
515 'info_dict': {
516 'id': 'lqQg6PlCWgI',
517 'ext': 'mp4',
cbe2bd91
PH
518 'upload_date': '20120731',
519 'uploader_id': 'olympic',
520 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
521 'uploader': 'Olympics',
522 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
523 },
524 'params': {
525 'skip_download': 'requires avconv',
e52a40ab 526 }
cbe2bd91 527 },
6271f1ca
PH
528 # Non-square pixels
529 {
530 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
531 'info_dict': {
532 'id': '_b-2C3KPAM0',
533 'ext': 'mp4',
534 'stretched_ratio': 16 / 9.,
535 'upload_date': '20110310',
536 'uploader_id': 'AllenMeow',
537 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
538 'uploader': '孫艾倫',
539 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
540 },
06b491eb
S
541 },
542 # url_encoded_fmt_stream_map is empty string
543 {
544 'url': 'qEJwOuvDf7I',
545 'info_dict': {
546 'id': 'qEJwOuvDf7I',
547 'ext': 'mp4',
548 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
549 'description': '',
550 'upload_date': '20150404',
551 'uploader_id': 'spbelect',
552 'uploader': 'Наблюдатели Петербурга',
553 },
554 'params': {
555 'skip_download': 'requires avconv',
556 }
557 },
da77d856
S
558 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
559 {
560 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
561 'info_dict': {
562 'id': 'FIl7x6_3R5Y',
563 'ext': 'mp4',
564 'title': 'md5:7b81415841e02ecd4313668cde88737a',
565 'description': 'md5:116377fd2963b81ec4ce64b542173306',
566 'upload_date': '20150625',
567 'uploader_id': 'dorappi2000',
568 'uploader': 'dorappi2000',
569 'formats': 'mincount:33',
570 },
2ee8f5d8 571 },
8a1a26ce
YCH
572 # DASH manifest with segment_list
573 {
574 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
575 'md5': '8ce563a1d667b599d21064e982ab9e31',
576 'info_dict': {
577 'id': 'CsmdDsKjzN8',
578 'ext': 'mp4',
17ee98e1 579 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
8a1a26ce
YCH
580 'uploader': 'Airtek',
581 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
582 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
583 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
584 },
585 'params': {
586 'youtube_include_dash_manifest': True,
587 'format': '135', # bestvideo
588 }
2ee8f5d8 589 },
cf7e015f
S
590 {
591 # Multifeed videos (multiple cameras), URL is for Main Camera
592 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
593 'info_dict': {
594 'id': 'jqWvoWXjCVs',
595 'title': 'teamPGP: Rocket League Noob Stream',
596 'description': 'md5:dc7872fb300e143831327f1bae3af010',
597 },
598 'playlist': [{
599 'info_dict': {
600 'id': 'jqWvoWXjCVs',
601 'ext': 'mp4',
602 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
603 'description': 'md5:dc7872fb300e143831327f1bae3af010',
604 'upload_date': '20150721',
605 'uploader': 'Beer Games Beer',
606 'uploader_id': 'beergamesbeer',
607 },
608 }, {
609 'info_dict': {
610 'id': '6h8e8xoXJzg',
611 'ext': 'mp4',
612 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
613 'description': 'md5:dc7872fb300e143831327f1bae3af010',
614 'upload_date': '20150721',
615 'uploader': 'Beer Games Beer',
616 'uploader_id': 'beergamesbeer',
617 },
618 }, {
619 'info_dict': {
620 'id': 'PUOgX5z9xZw',
621 'ext': 'mp4',
622 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
623 'description': 'md5:dc7872fb300e143831327f1bae3af010',
624 'upload_date': '20150721',
625 'uploader': 'Beer Games Beer',
626 'uploader_id': 'beergamesbeer',
627 },
628 }, {
629 'info_dict': {
630 'id': 'teuwxikvS5k',
631 'ext': 'mp4',
632 'title': 'teamPGP: Rocket League Noob Stream (zim)',
633 'description': 'md5:dc7872fb300e143831327f1bae3af010',
634 'upload_date': '20150721',
635 'uploader': 'Beer Games Beer',
636 'uploader_id': 'beergamesbeer',
637 },
638 }],
639 'params': {
640 'skip_download': True,
641 },
642 }
2eb88d95
PH
643 ]
644
e0df6211
PH
645 def __init__(self, *args, **kwargs):
646 super(YoutubeIE, self).__init__(*args, **kwargs)
83799698 647 self._player_cache = {}
e0df6211 648
c5e8d7af
PH
649 def report_video_info_webpage_download(self, video_id):
650 """Report attempt to download video info webpage."""
69ea8ca4 651 self.to_screen('%s: Downloading video info webpage' % video_id)
c5e8d7af 652
c5e8d7af
PH
653 def report_information_extraction(self, video_id):
654 """Report attempt to extract video information."""
69ea8ca4 655 self.to_screen('%s: Extracting video information' % video_id)
c5e8d7af
PH
656
657 def report_unavailable_format(self, video_id, format):
658 """Report extracted video URL."""
69ea8ca4 659 self.to_screen('%s: Format %s not available' % (video_id, format))
c5e8d7af
PH
660
661 def report_rtmp_download(self):
662 """Indicate the download will use the RTMP protocol."""
69ea8ca4 663 self.to_screen('RTMP download detected')
c5e8d7af 664
60064c53
PH
665 def _signature_cache_id(self, example_sig):
666 """ Return a string representation of a signature """
78caa52a 667 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
60064c53
PH
668
669 def _extract_signature_function(self, video_id, player_url, example_sig):
cf010131 670 id_m = re.match(
60620368 671 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
cf010131 672 player_url)
c081b35c
PH
673 if not id_m:
674 raise ExtractorError('Cannot identify player %r' % player_url)
e0df6211
PH
675 player_type = id_m.group('ext')
676 player_id = id_m.group('id')
677
c4417ddb 678 # Read from filesystem cache
60064c53
PH
679 func_id = '%s_%s_%s' % (
680 player_type, player_id, self._signature_cache_id(example_sig))
c4417ddb 681 assert os.path.basename(func_id) == func_id
a0e07d31 682
69ea8ca4 683 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
a0e07d31 684 if cache_spec is not None:
78caa52a 685 return lambda s: ''.join(s[i] for i in cache_spec)
83799698 686
6d1a55a5
PH
687 download_note = (
688 'Downloading player %s' % player_url
689 if self._downloader.params.get('verbose') else
690 'Downloading %s player %s' % (player_type, player_id)
691 )
e0df6211
PH
692 if player_type == 'js':
693 code = self._download_webpage(
694 player_url, video_id,
6d1a55a5 695 note=download_note,
69ea8ca4 696 errnote='Download of %s failed' % player_url)
83799698 697 res = self._parse_sig_js(code)
c4417ddb 698 elif player_type == 'swf':
e0df6211
PH
699 urlh = self._request_webpage(
700 player_url, video_id,
6d1a55a5 701 note=download_note,
69ea8ca4 702 errnote='Download of %s failed' % player_url)
e0df6211 703 code = urlh.read()
83799698 704 res = self._parse_sig_swf(code)
e0df6211
PH
705 else:
706 assert False, 'Invalid player type %r' % player_type
707
785521bf
PH
708 test_string = ''.join(map(compat_chr, range(len(example_sig))))
709 cache_res = res(test_string)
710 cache_spec = [ord(c) for c in cache_res]
83799698 711
69ea8ca4 712 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
83799698
PH
713 return res
714
def _print_sig_code(self, func, example_sig):
    """Print Python source code equivalent to the signature function.

    ``func`` is applied to a probe string whose characters are the code
    points 0..len(example_sig)-1, so the result reveals which input index
    ends up at each output position.  That permutation is rendered as a
    sum of ``s[...]`` index/slice expressions and written out with
    ``self.to_screen``.

    :param func: callable taking the encrypted signature string and
        returning the decrypted one.
    :param example_sig: a sample encrypted signature; only its length and
        the lengths of its dot-separated parts are used.
    """
    def gen_sig_code(idxs):
        """Yield ``s[...]`` expressions that reproduce the index list."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # A slice end below zero would be read as "from the end", so an
            # open-ended slice is emitted in that case.
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Empty permutation: there is nothing to express.
            return

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        # Seed ``i`` so that the tail below also works for a one-element
        # list (the loop body never runs and ``i`` used to be unbound).
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run of consecutive indices.
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending/descending run starts at ``prev``.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    # Fall back to an empty-string literal so the printed ``return``
    # statement stays a valid expression when there is nothing to join.
    expr_code = ' + '.join(gen_sig_code(cache_spec)) or "''"
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 753
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature-decrypting callable from the JS player source.

    The name of the signature function is located with a regular
    expression, then that function is extracted through ``JSInterpreter``.

    :param jscode: source code of the JavaScript player.
    :returns: a callable that takes the encrypted signature string and
        returns whatever the extracted JS function returns for it.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    extracted = JSInterpreter(jscode).extract_function(sig_func_name)

    def decrypt(s):
        # The extracted function expects its arguments as a list.
        return extracted([s])

    return decrypt
762
def _parse_sig_swf(self, file_contents):
    """Build a signature-decrypting callable from the SWF player.

    Extracts the ``decipher`` function of the ``SignatureDecipher`` class
    through ``SWFInterpreter``.

    :param file_contents: raw contents of the SWF player file.
    :returns: a callable that takes the encrypted signature string and
        returns whatever the extracted function returns for it.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def decrypt(s):
        # The extracted function expects its arguments as a list.
        return decipher([s])

    return decrypt
769
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    The decryption function is looked up in ``self._player_cache`` under
    the key ``(player_url, self._signature_cache_id(s))`` and extracted
    with ``self._extract_signature_function`` on a cache miss.

    :param s: the encrypted signature.
    :param video_id: video id, forwarded to the function extraction.
    :param player_url: URL of the player; protocol-relative URLs
        (``//...``) are completed with ``https:``.
    :param age_gate: accepted for callers; not used by this method.
    :raises ExtractorError: if ``player_url`` is None, or if anything
        fails while obtaining or applying the function (the traceback is
        embedded in the message).
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        sig_func = self._player_cache.get(cache_key)
        if sig_func is None:
            sig_func = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = sig_func
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 793
def _get_subtitles(self, video_id, webpage):
    """Return the subtitle tracks listed for a video.

    Downloads the timedtext track list and, for every language (first
    track wins when a language occurs more than once), builds one URL per
    supported format (``sbv``, ``vtt``, ``srt``).

    :param video_id: id of the video.
    :param webpage: accepted for callers; not used by this method.
    :returns: dict mapping language code to a list of
        ``{'url': ..., 'ext': ...}`` dicts; empty (after a warning) when
        the list cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Keep the first track seen for each language.
            continue
        track_name = track.attrib['name'].encode('utf-8')
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track_name,
            }),
            'ext': ext,
        } for ext in ['sbv', 'vtt', 'srt']]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
825
def _get_automatic_captions(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    The caption base URL and timestamp are read from the ``args`` of the
    ``ytplayer.config`` JSON embedded in ``webpage``.  The caption list is
    then downloaded and one URL per target language and format (``sbv``,
    ``vtt``, ``srt``) is built.

    :returns: dict mapping target language code to a list of
        ``{'url': ..., 'ext': ...}`` dicts; empty (after a warning) when
        no automatic captions can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    config_match = re.search(r';ytplayer.config = ({.*?});', webpage)
    if config_match is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(config_match.group(1))
    try:
        player_args = player_config['args']
        caption_url = player_args['ttsurl']
        timestamp = player_args['timestamp']
        # We get the available subtitles
        list_url = caption_url + '&' + compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        caption_list = self._download_xml(list_url, video_id)
        source_track = caption_list.find('track')
        if source_track is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = source_track.attrib['lang_code']
        caption_kind = source_track.attrib.get('kind', '')

        captions = {}
        for target in caption_list.findall('target'):
            sub_lang = target.attrib['lang_code']
            captions[sub_lang] = [{
                'url': caption_url + '&' + compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': ext,
                    'ts': timestamp,
                    'kind': caption_kind,
                }),
                'ext': ext,
            } for ext in ['sbv', 'vtt', 'srt']]
        return captions
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
878
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id contained in ``url``.

    ``url`` is matched against ``cls._VALID_URL`` in verbose mode; the id
    is the second capture group of that pattern.

    :raises ExtractorError: if ``url`` does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
886
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Map itags to the format URLs listed in an m3u8 manifest.

    Every non-empty manifest line that does not start with ``#`` is
    treated as a format URL; its itag is taken from the ``itag/<n>/`` path
    component.

    :returns: dict mapping itag (string) to URL.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        if not line or line.startswith('#'):
            # Blank line or manifest tag/comment, not a URL.
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
901
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document for ``video_id``.

    The body of the ``annotations_invideo`` response is returned as-is.
    """
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 905
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
    """Download a DASH manifest and return the formats it describes.

    An encrypted ``/s/<sig>`` component in the manifest URL is replaced by
    ``/signature/<decrypted sig>`` before downloading.

    :param video_id: id of the video, used for reporting.
    :param dash_manifest_url: URL of the DASH manifest (MPD).
    :param player_url: player URL handed to ``_decrypt_signature``.
    :param age_gate: forwarded to ``_decrypt_signature``.
    :param fatal: forwarded to ``_download_xml``; when the download yields
        ``False`` an empty list is returned.
    :returns: list of format dicts, one per distinct representation id,
        merged with the static data from ``self._formats``.
    """
    NS = '{urn:mpeg:DASH:schema:MPD:2011}'

    def decrypt_sig(mobj):
        s = mobj.group(1)
        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
        return '/signature/%s' % dec_s
    dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest',
        fatal=fatal)

    if dash_doc is False:
        return []

    formats = []
    for a in dash_doc.findall('.//%sAdaptationSet' % NS):
        mime_type = a.attrib.get('mimeType')
        for r in a.findall('%sRepresentation' % NS):
            url_el = r.find('%sBaseURL' % NS)
            if url_el is None:
                continue
            if mime_type == 'text/vtt':
                # TODO implement WebVTT downloading
                pass
            # ``mimeType`` may be absent from the AdaptationSet, in which
            # case ``mime_type`` is None; guard the ``startswith`` so such
            # a set ends up in the warning branch instead of raising
            # AttributeError.
            elif mime_type and mime_type.startswith(('audio/', 'video/')):
                segment_list = r.find('%sSegmentList' % NS)
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'height': int_or_none(r.attrib.get('height')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                    'fps': int_or_none(r.attrib.get('frameRate')),
                }
                if segment_list is not None:
                    f.update({
                        'initialization_url': segment_list.find('%sInitialization' % NS).attrib['sourceURL'],
                        'segment_urls': [
                            segment.attrib.get('media')
                            for segment in segment_list.findall('%sSegmentURL' % NS)],
                        'protocol': 'http_dash_segments',
                    })
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    # First time this id is seen: start from the static
                    # description and overlay the manifest data.
                    full_info = self._formats.get(format_id, {}).copy()
                    full_info.update(f)
                    codecs = r.attrib.get('codecs')
                    if codecs:
                        # Fill in only the codec that the static
                        # description leaves open.
                        if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                            full_info['vcodec'] = codecs
                        elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                            full_info['acodec'] = codecs
                    formats.append(full_info)
                else:
                    existing_format.update(f)
            else:
                self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
972
def _real_extract(self, url):
    """Extract metadata and formats for a single YouTube video.

    Outline: parse start/end times from the URL, download the watch page,
    obtain the video info (from the embed page for age-gated videos,
    otherwise from the page's ``ytplayer.config`` and ``get_video_info``),
    scrape the descriptive metadata, build the format list (RTMP, the
    encoded stream maps or an HLS manifest) and finally merge in the
    formats of any DASH manifests.

    :param url: video URL, possibly carrying smuggled data.
    :returns: an info dict for the video, or a playlist result when the
        video has multiple feeds and ``noplaylist`` is not set.
    :raises ExtractorError: when the video info carries no ``token``, for
        "rental" videos, when the uploader is missing, for rtmpe streams,
        or when no format information is found.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    # The fragment is inspected before the query, so its values win.
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the text of their title attribute.
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
            for feed in multifeed_metadata_list.split(','):
                feed_data = compat_parse_qs(feed)
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    # Smuggle the flag so that extracting the feed does not
                    # expand into this playlist again.
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise the date that was just scraped.  This used to read
            # ``mobj.group(1)``, i.e. the stale uploader-id match from
            # above (or None), never the date itself.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # ``player_url`` is read when this helper is called, so it sees the
        # value assigned during signature handling below.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                # Additional DASH manifests may end up in HTTP Error 403 therefore
                # allow them to fail without bug report message if we already have
                # some DASH manifest succeeded. This is temporary workaround to reduce
                # burst of bug reports until we figure out the reason and whether it
                # can be fixed at all.
                dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }
c5e8d7af 1389
5f6a1245 1390
880e1c52 1391class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
78caa52a 1392 IE_DESC = 'YouTube.com playlists'
d67cc9fa 1393 _VALID_URL = r"""(?x)(?:
c5e8d7af
PH
1394 (?:https?://)?
1395 (?:\w+\.)?
1396 youtube\.com/
1397 (?:
ac7553d0 1398 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
c5e8d7af
PH
1399 \? (?:.*?&)*? (?:p|a|list)=
1400 | p/
1401 )
d67cc9fa 1402 (
99209c29 1403 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
5f6a1245 1404 # Top tracks, they can also include dots
d67cc9fa
JMF
1405 |(?:MC)[\w\.]*
1406 )
c5e8d7af
PH
1407 .*
1408 |
99209c29 1409 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
c5e8d7af 1410 )"""
dbb94fb0 1411 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
dbb94fb0 1412 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
78caa52a 1413 IE_NAME = 'youtube:playlist'
81127aa5
PH
# Test cases for the playlist extractor.  Each entry gives the URL to
# extract, the expected ``info_dict`` fields and either an exact
# ``playlist_count`` or a lower bound ``playlist_mincount``.
_TESTS = [{
    'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
    'info_dict': {
        'title': 'ytdl test PL',
        'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
    },
    'playlist_count': 3,
}, {
    'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
    'info_dict': {
        'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'title': 'YDL_Empty_List',
    },
    'playlist_count': 0,
}, {
    'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
    'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
    'info_dict': {
        'title': '29C3: Not my department',
        'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
    },
    'playlist_count': 95,
}, {
    'note': 'issue #673',
    'url': 'PLBB231211A4F62143',
    'info_dict': {
        'title': '[OLD]Team Fortress 2 (Class-based LP)',
        'id': 'PLBB231211A4F62143',
    },
    'playlist_mincount': 26,
}, {
    'note': 'Large playlist',
    'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
    'info_dict': {
        'title': 'Uploads from Cauchemar',
        'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
    },
    'playlist_mincount': 799,
}, {
    'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
    'info_dict': {
        'title': 'YDL_safe_search',
        'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
    },
    'playlist_count': 2,
}, {
    'note': 'embedded',
    'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
    'playlist_count': 4,
    'info_dict': {
        'title': 'JODA15',
        'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
    }
}, {
    'note': 'Embedded SWF player',
    'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
    'playlist_count': 4,
    'info_dict': {
        'title': 'JODA7',
        'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
    }
}, {
    'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
    'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
    'info_dict': {
        'title': 'Uploads from Interstellar Movie',
        'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
    },
    # Key was misspelled 'playlist_mincout', unlike the other cases that
    # give a lower bound.
    'playlist_mincount': 21,
}]
c5e8d7af 1484
880e1c52
JMF
def _real_initialize(self):
    # Initialization for this extractor consists solely of delegating to
    # self._login().
    self._login()
1487
def _extract_mix(self, playlist_id):
    """Extract a YouTube mix as a playlist result.

    The watch page of the mix is downloaded, the title is taken from the
    first matching title element and the video ids are collected, in page
    order without duplicates, from the links that reference this mix.

    :param playlist_id: id of the mix.
    :returns: the value of ``self.playlist_result`` for the found videos.
    """
    # The mixes are generated from a single video
    # the id of the playlist is just 'RD' + video_id
    seed_video_id = playlist_id[-11:]
    url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
    webpage = self._download_webpage(
        url, playlist_id, 'Downloading Youtube mix')

    # Try the candidate class names in order; keep the first non-empty hit
    # (or the result for the last candidate when none matches).
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    video_id_re = r'''(?xs)data-video-username=".*?".*?
        href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
    ids = orderedSet(re.findall(video_id_re, webpage))

    return self.playlist_result(self._ids_to_results(ids), playlist_id, title)
1507
def _extract_playlist(self, playlist_id):
    """Download a playlist page and return a playlist result.

    Alert messages on the first page are checked before anything else:
    a missing/private playlist or invalid parameters raise an expected
    ExtractorError, the language chooser is ignored and any other alert is
    reported as a warning. The entries are produced lazily, following the
    "load more" links page by page.
    """
    page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

    for alert in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', alert):
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
        page, 'title')

    # Extract the video ids from the playlist pages
    def _entries():
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            video_ids = orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0')
            for video_id in video_ids:
                yield self.url_result(video_id, 'Youtube', video_id=video_id)

            more_mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if more_mobj is None:
                return

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            more_widget_html = more['load_more_widget_html']

    return self.playlist_result(_entries(), playlist_id, playlist_title)
c5e8d7af 1560
448830ce
S
def _real_extract(self, url):
    """Dispatch a playlist URL to the matching extraction routine.

    A URL that also names a single video (``v=`` query argument) yields
    just that video when --no-playlist is set. Playlist ids starting with
    'RD' or 'UL' are handled as mixes, everything else as a regular
    playlist.
    """
    # Extract playlist id
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = mobj.group(1) or mobj.group(2)

    # Check if it's a video-specific URL
    query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    if 'v' in query_dict:
        video_id = query_dict['v'][0]
        if self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(video_id, 'Youtube', video_id=video_id)
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL')):
        return self._extract_mix(playlist_id)
    return self._extract_playlist(playlist_id)
1583
c5e8d7af
PH
1584
class YoutubeChannelIE(InfoExtractor):
    """Extract the videos listed on a YouTube channel.

    When the channel id can be read from the channel page, extraction is
    delegated to the channel's uploads playlist ('UU' + the channel id
    without its 'UC' prefix). Otherwise the channel listing is walked page
    by page.
    """
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return (video_id, title) pairs for the watch links found in ``page``.

        Each video id appears once, in order of first occurrence. A title
        found on a later link for the same id fills in a missing one.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            try:
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Return the channel's videos, as a playlist redirect or a paged listing."""
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = None
        # The download is non-fatal, so the page may be missing (falsy);
        # only search it when there is something to search, otherwise go
        # straight to the by-page fallback below.
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1677
1678
class YoutubeUserIE(YoutubeChannelIE):
    """Channel-style extractor addressed by user name (URL or "ytuser:" keyword)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if no other extractor of this module claims it."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match it as well.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
f4b05232 1705
b05654f0 1706
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Search extractor that pages through YouTube's results listing."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra arguments merged into every results query (overridden by subclasses)
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded until ``n`` videos have been collected
        or a page yields no videos. Raises an expected ExtractorError when
        a page carries a "search-message" block.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough videos are collected; ">=" avoids
            # downloading one more page when exactly n are already there.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
75dff0ee 1750
c9ae7b95 1751
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Variant of YoutubeSearchIE with its own search key and an extra
    # query argument asking for results sorted by upload date.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Merged into the results query by YoutubeSearchIE._get_n_results
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
75dff0ee 1757
c9ae7b95
PH
1758
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a YouTube results URL into a playlist of the listed items."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Download the results page and return one url entry per result title."""
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _build_entry(part_code):
            # The title is optional, the link is mandatory
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', item_href),
                'title': item_title,
            }

        entries = [
            _build_entry(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1800
1801
class YoutubeShowIE(InfoExtractor):
    """Expand a YouTube show page into one playlist entry per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Collect the season playlist links of the show page."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1836
1837
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name ('youtube:<feed>')."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed, following its "load more" links.

        The input ``url`` is not used; the feed page is addressed through
        ``_FEED_NAME``.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos.
            # This must be a real list: on Python 3 filter() returns an
            # iterator, which is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1885
1886
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    # Extracts the "watch later" list through the regular playlist code,
    # using the fixed playlist id 'WL'.
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Return the 'WL' playlist; the matched ``url`` itself is not used."""
        return self._extract_playlist('WL')
f459d170 1896
5f6a1245 1897
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    # Resolves the account's favourites page to a playlist id and hands it
    # over to the 'YoutubePlaylist' extractor.
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result for the favourites playlist.

        The matched ``url`` is not used: the fixed my_favorites page is
        downloaded and the first ``list=`` value found in it is taken as
        the playlist id.
        """
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1908
1909
25f14e9f
S
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/recommended
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Inserted into the feed URL and the extractor name by the base class
    _FEED_NAME = 'recommended'
    # Used by the base class as the title of the resulting playlist
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1ed5b5c9 1915
1ed5b5c9 1916
25f14e9f
S
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/subscriptions
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Inserted into the feed URL and the extractor name by the base class
    _FEED_NAME = 'subscriptions'
    # Used by the base class as the title of the resulting playlist
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
1ed5b5c9 1922
1ed5b5c9 1923
25f14e9f
S
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for https://www.youtube.com/feed/history
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the pattern contains the regex escape "\.", which is an
    # invalid escape sequence in a normal string literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    # Inserted into the feed URL and the extractor name by the base class
    _FEED_NAME = 'history'
    # Used by the base class as the title of the resulting playlist
    _PLAYLIST_TITLE = 'Youtube History'
1ed5b5c9
JMF
1929
1930
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    # Matches YouTube URLs that end right after a watch/attribution_link
    # query fragment carrying no video id, and answers them with a hint
    # about quoting the URL instead of extracting anything.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError explaining how to quote the URL."""
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
772fd5cc
PH
1974
1975
class YoutubeTruncatedIDIE(InfoExtractor):
    # Matches watch URLs whose v= value is only 1 to 10 characters long
    # and reports them as truncated.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)