]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
flake8: Ignore some error added in pep8 1.6
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
78caa52a
PH
3from __future__ import unicode_literals
4
5
0ca96d48 6import itertools
c5e8d7af 7import json
c4417ddb 8import os.path
c5e8d7af 9import re
42939b61 10import time
e0df6211 11import traceback
c5e8d7af 12
b05654f0 13from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 14from .subtitles import SubtitlesInfoExtractor
2b25cb5d 15from ..jsinterp import JSInterpreter
54256267 16from ..swfinterp import SWFInterpreter
4bb4a188 17from ..compat import (
edf3e38e 18 compat_chr,
c5e8d7af 19 compat_parse_qs,
c5e8d7af
PH
20 compat_urllib_parse,
21 compat_urllib_request,
7c61bd36 22 compat_urlparse,
c5e8d7af 23 compat_str,
4bb4a188
PH
24)
25from ..utils import (
c5e8d7af 26 clean_html,
c5e8d7af 27 ExtractorError,
4bb4a188
PH
28 get_element_by_attribute,
29 get_element_by_id,
dd27fd17 30 int_or_none,
9c44d242 31 OnDemandPagedList,
4bb4a188 32 orderedSet,
c5e8d7af
PH
33 unescapeHTML,
34 unified_strdate,
81c2f20b 35 uppercase_escape,
c5e8d7af
PH
36)
37
5f6a1245 38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are served with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # The documented contract is "False on failure"; a bare return
            # would hand None back to the caller instead.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # Without the token the TFA form cannot be built; bail out
                # instead of dereferencing the missing match.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being shown the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here: _login has already reported a warning.
        if not self._login():
            return
c5e8d7af 185
8377574c 186
de7f3446 187class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
78caa52a 188 IE_DESC = 'YouTube.com'
cb7dfeea 189 _VALID_URL = r"""(?x)^
c5e8d7af 190 (
edb53e2d 191 (?:https?://|//) # http(s):// or protocol-independent URL
cb7dfeea 192 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 193 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 194 (?:www\.)?pwnyoutube\.com/|
f7000f3a 195 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
196 tube\.majestyc\.net/|
197 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
198 (?:.*?\#/)? # handle anchor (#/) redirect urls
199 (?: # the various things that can precede the ID:
ac7553d0 200 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
c5e8d7af 201 |(?: # or the v= param in all its forms
f7000f3a 202 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
203 (?:\?|\#!?) # the params delimiter ? or # or #!
204 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
205 v=
206 )
f4b05232
JMF
207 ))
208 |youtu\.be/ # just youtu.be/xxxx
edb53e2d 209 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 210 )
c5e8d7af 211 )? # all until now is optional -> you can pass the naked ID
8963d9c2 212 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
9291475f 213 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
c5e8d7af
PH
214 (?(1).+)? # if we found the ID, everything can follow
215 $"""
c5e8d7af 216 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
217 _formats = {
218 '5': {'ext': 'flv', 'width': 400, 'height': 240},
219 '6': {'ext': 'flv', 'width': 450, 'height': 270},
220 '13': {'ext': '3gp'},
221 '17': {'ext': '3gp', 'width': 176, 'height': 144},
222 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
223 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
224 '34': {'ext': 'flv', 'width': 640, 'height': 360},
225 '35': {'ext': 'flv', 'width': 854, 'height': 480},
226 '36': {'ext': '3gp', 'width': 320, 'height': 240},
227 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
228 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
229 '43': {'ext': 'webm', 'width': 640, 'height': 360},
230 '44': {'ext': 'webm', 'width': 854, 'height': 480},
231 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
232 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
233
1d043b93 234
86fe61c8 235 # 3d videos
43b81eb9
PH
236 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
237 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
238 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
239 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
240 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
241 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
242 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 243
96fb5605 244 # Apple HTTP Live Streaming
43b81eb9
PH
245 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
246 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
247 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
248 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
249 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
250 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
251 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
252
253 # DASH mp4 video
43b81eb9
PH
254 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
255 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
256 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
257 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
258 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
e65566a9 259 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
43b81eb9
PH
260 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
261 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
0d2c1418
PH
262 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
263 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
264 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
836a086c 265
f6f1fc92 266 # Dash mp4 audio
62cd676c
PH
267 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
268 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
269 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
836a086c
AZ
270
271 # Dash webm
e75cafe9
A
272 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
273 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
274 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
275 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
276 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
277 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
1cc887cb 278 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
e75cafe9
A
279 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
280 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
281 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
282 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
283 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
284 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
285 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 286 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 287 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
a51d3aa0
PH
288 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
289 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
76b3c610 290 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
10a404c3 291 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
76b3c610 292 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
2c62dc26
PH
293
294 # Dash webm audio
55db73ef 295 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
e75cafe9 296 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d 297
0857baad
PH
298 # Dash webm audio with opus inside
299 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
300 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
301 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
302
ce6b9a2d
PH
303 # RTMP (unnamed)
304 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 305 }
836a086c 306
78caa52a 307 IE_NAME = 'youtube'
2eb88d95
PH
308 _TESTS = [
309 {
4bc3a23e
PH
310 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
311 'info_dict': {
312 'id': 'BaW_jenozKc',
313 'ext': 'mp4',
314 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
315 'uploader': 'Philipp Hagemeister',
316 'uploader_id': 'phihag',
317 'upload_date': '20121002',
318 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
319 'categories': ['Science & Technology'],
3e7c1224
PH
320 'like_count': int,
321 'dislike_count': int,
2eb88d95 322 }
0e853ca4 323 },
0e853ca4 324 {
4bc3a23e
PH
325 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
326 'note': 'Test generic use_cipher_signature video (#897)',
327 'info_dict': {
328 'id': 'UxxajLWwzqY',
329 'ext': 'mp4',
330 'upload_date': '20120506',
331 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
332 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
333 'uploader': 'Icona Pop',
334 'uploader_id': 'IconaPop',
2eb88d95 335 }
c108eb73
JMF
336 },
337 {
4bc3a23e
PH
338 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
339 'note': 'Test VEVO video with age protection (#956)',
340 'info_dict': {
341 'id': '07FYdnEawAQ',
342 'ext': 'mp4',
343 'upload_date': '20130703',
344 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
345 'description': 'md5:64249768eec3bc4276236606ea996373',
346 'uploader': 'justintimberlakeVEVO',
347 'uploader_id': 'justintimberlakeVEVO',
c108eb73
JMF
348 }
349 },
fccd3771 350 {
4bc3a23e
PH
351 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
352 'note': 'Embed-only video (#1746)',
353 'info_dict': {
354 'id': 'yZIXLfi8CZQ',
355 'ext': 'mp4',
356 'upload_date': '20120608',
357 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
358 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
359 'uploader': 'SET India',
360 'uploader_id': 'setindia'
fccd3771
PH
361 }
362 },
dd27fd17 363 {
4bc3a23e
PH
364 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
365 'note': '256k DASH audio (format 141) via DASH manifest',
366 'info_dict': {
367 'id': 'a9LDPn-MO4I',
368 'ext': 'm4a',
369 'upload_date': '20121002',
370 'uploader_id': '8KVIDEO',
371 'description': '',
372 'uploader': '8KVIDEO',
373 'title': 'UHDTV TEST 8K VIDEO.mp4'
4919603f 374 },
4bc3a23e
PH
375 'params': {
376 'youtube_include_dash_manifest': True,
377 'format': '141',
4919603f 378 },
dd27fd17 379 },
3489b7d2
JMF
380 # DASH manifest with encrypted signature
381 {
78caa52a
PH
382 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
383 'info_dict': {
384 'id': 'IB3lcPjvWLA',
385 'ext': 'm4a',
b766eb27
JMF
386 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
387 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
78caa52a
PH
388 'uploader': 'AfrojackVEVO',
389 'uploader_id': 'AfrojackVEVO',
390 'upload_date': '20131011',
3489b7d2 391 },
4bc3a23e 392 'params': {
78caa52a
PH
393 'youtube_include_dash_manifest': True,
394 'format': '141',
3489b7d2
JMF
395 },
396 },
aaeb86f6
S
397 # JS player signature function name containing $
398 {
399 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
400 'info_dict': {
401 'id': 'nfWlot6h_JM',
402 'ext': 'm4a',
403 'title': 'Taylor Swift - Shake It Off',
404 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
405 'uploader': 'TaylorSwiftVEVO',
406 'uploader_id': 'TaylorSwiftVEVO',
407 'upload_date': '20140818',
408 },
409 'params': {
410 'youtube_include_dash_manifest': True,
411 'format': '141',
412 },
413 },
aa79ac0c
PH
414 # Controversy video
415 {
416 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
417 'info_dict': {
418 'id': 'T4XJQO3qol8',
419 'ext': 'mp4',
420 'upload_date': '20100909',
421 'uploader': 'The Amazing Atheist',
422 'uploader_id': 'TheAmazingAtheist',
423 'title': 'Burning Everyone\'s Koran',
424 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
425 }
c522adb1
JMF
426 },
427 # Normal age-gate video (No vevo, embed allowed)
428 {
429 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
430 'info_dict': {
431 'id': 'HtVdAasjOgU',
432 'ext': 'mp4',
433 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
9ed99402 434 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
c522adb1
JMF
435 'uploader': 'The Witcher',
436 'uploader_id': 'WitcherGame',
437 'upload_date': '20140605',
438 },
439 },
fccae2b9
S
440 # Age-gate video with encrypted signature
441 {
442 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
443 'info_dict': {
444 'id': '6kLq3WMV1nU',
445 'ext': 'mp4',
446 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
447 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
448 'uploader': 'LloydVEVO',
449 'uploader_id': 'LloydVEVO',
450 'upload_date': '20110629',
451 },
452 },
774e208f
PH
453 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
454 {
455 'url': '__2ABJjxzNo',
456 'info_dict': {
457 'id': '__2ABJjxzNo',
458 'ext': 'mp4',
459 'upload_date': '20100430',
460 'uploader_id': 'deadmau5',
461 'description': 'md5:12c56784b8032162bb936a5f76d55360',
462 'uploader': 'deadmau5',
463 'title': 'Deadmau5 - Some Chords (HD)',
464 },
465 'expected_warnings': [
466 'DASH manifest missing',
467 ]
e52a40ab
PH
468 },
469 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
470 {
471 'url': 'lqQg6PlCWgI',
472 'info_dict': {
473 'id': 'lqQg6PlCWgI',
474 'ext': 'mp4',
cbe2bd91
PH
475 'upload_date': '20120731',
476 'uploader_id': 'olympic',
477 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
478 'uploader': 'Olympics',
479 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
480 },
481 'params': {
482 'skip_download': 'requires avconv',
e52a40ab 483 }
cbe2bd91 484 },
6271f1ca
PH
485 # Non-square pixels
486 {
487 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
488 'info_dict': {
489 'id': '_b-2C3KPAM0',
490 'ext': 'mp4',
491 'stretched_ratio': 16 / 9.,
492 'upload_date': '20110310',
493 'uploader_id': 'AllenMeow',
494 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
495 'uploader': '孫艾倫',
496 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
497 },
498 }
2eb88d95
PH
499 ]
500
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent initialiser and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature: (player_url, signature cache id) -> signature function
    self._player_cache = {}
e0df6211 504
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
c5e8d7af 508
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
c5e8d7af
PH
512
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
c5e8d7af
PH
516
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
c5e8d7af 520
60064c53
PH
def _signature_cache_id(self, example_sig):
    """ Return a string representation of a signature

    The result is the length of every '.'-separated part of example_sig,
    joined by dots again.
    """
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)
60064c53
PH
524
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that turns an encrypted signature into a usable one.

    The player id and type ('js' or 'swf') are taken from player_url.  A
    previously stored index permutation is loaded from the downloader's
    'youtube-sigfuncs' cache when available; otherwise the player is
    downloaded, parsed, and the resulting permutation is stored for next time.

    Raises ExtractorError if the player URL cannot be parsed or the player
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Sanity check: the cache key must be a plain file name without any path part
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec lists, per output position, the input index to copy from
        return lambda s: ''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note='Downloading %s player %s' % (player_type, player_id),
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An assert would be stripped under -O and leave `res` unbound
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose n-th character has code point n, so
    # the output characters directly spell out the permutation of indices.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res
566
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function func.

    func is probed with a string whose n-th character has code point n; the
    resulting index sequence is rendered as a sum of s[...] index and slice
    expressions and written out through to_screen.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive, moving by stride) as a slice of s
        lower = str(first) if first != 0 else ''
        stop = last + stride
        upper = (':%d' % stop) if stop >= 0 else ':'
        suffix = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, suffix)

    def _emit_terms(indices):
        stride = None
        # Squelch pyflakes warnings - run_start will be set when stride is set
        run_start = '(Never used)'
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if stride is not None:
                # Inside a run: either it goes on, or it ended at `before`
                if delta != stride:
                    yield _slice_expr(run_start, before, stride)
                    stride = None
                continue
            if delta in [-1, 1]:
                stride = delta
                run_start = before
            else:
                yield 's[%d]' % before
        # Flush whatever is still pending for the last index
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(compat_chr(n) for n in range(len(example_sig)))
    permutation = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(_emit_terms(permutation))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    template = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n')
    code = template % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)
edf3e38e 605
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Build a signature function from the JavaScript player source.

    The name of the signature function is located in jscode with a regex and
    the function is then extracted with JSInterpreter.  The returned callable
    takes the signature string and passes it on as the single argument.
    """
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    sig_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        return sig_function([s])

    return decipher
614
def _parse_sig_swf(self, file_contents):
    """Build a signature function from the SWF player contents.

    The 'decipher' function of the 'SignatureDecipher' class is extracted
    with SWFInterpreter.  The returned callable takes the signature string
    and passes it on as the single argument.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    sig_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        return sig_function([s])

    return decipher
621
83799698 622 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
257a2501 623 """Turn the encrypted s field into a working signature"""
6b37f0be 624
c8bf86d5 625 if player_url is None:
69ea8ca4 626 raise ExtractorError('Cannot decrypt signature without player_url')
920de7a2 627
69ea8ca4 628 if player_url.startswith('//'):
78caa52a 629 player_url = 'https:' + player_url
c8bf86d5 630 try:
62af3a0e 631 player_id = (player_url, self._signature_cache_id(s))
c8bf86d5
PH
632 if player_id not in self._player_cache:
633 func = self._extract_signature_function(
60064c53 634 video_id, player_url, s
c8bf86d5
PH
635 )
636 self._player_cache[player_id] = func
637 func = self._player_cache[player_id]
638 if self._downloader.params.get('youtube_print_sig_code'):
60064c53 639 self._print_sig_code(func, s)
c8bf86d5
PH
640 return func(s)
641 except Exception as e:
642 tb = traceback.format_exc()
643 raise ExtractorError(
78caa52a 644 'Signature extraction failed: ' + tb, cause=e)
e0df6211 645
1f343eaa 646 def _get_available_subtitles(self, video_id, webpage):
de7f3446 647 try:
60e47a26 648 subs_doc = self._download_xml(
38c2e5b8 649 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
7fad1c63
JMF
650 video_id, note=False)
651 except ExtractorError as err:
69ea8ca4 652 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
de7f3446 653 return {}
de7f3446
JMF
654
655 sub_lang_list = {}
60e47a26
JMF
656 for track in subs_doc.findall('track'):
657 lang = track.attrib['lang_code']
7e660ac1
LD
658 if lang in sub_lang_list:
659 continue
de7f3446
JMF
660 params = compat_urllib_parse.urlencode({
661 'lang': lang,
662 'v': video_id,
ca715127 663 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
60e47a26 664 'name': track.attrib['name'].encode('utf-8'),
de7f3446 665 })
78caa52a 666 url = 'https://www.youtube.com/api/timedtext?' + params
de7f3446
JMF
667 sub_lang_list[lang] = url
668 if not sub_lang_list:
69ea8ca4 669 self._downloader.report_warning('video doesn\'t have subtitles')
de7f3446
JMF
670 return {}
671 return sub_lang_list
672
def _get_available_automatic_caption(self, video_id, webpage):
    """Build the URLs of the automatically generated captions of a video.

    The watch page is passed in (instead of being downloaded here) because
    the captions URL is read from the ``ytplayer.config`` object embedded
    in it.

    Returns a dict mapping each target language code to its caption URL.
    A warning is reported and an empty dict is returned when the player
    configuration or the captions cannot be found.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen('%s: Looking for automatic captions' % video_id)
    # Same pattern as the one used by _real_extract: the dot is escaped and
    # the whitespace around '=' is optional.
    mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        player_config = json.loads(mobj.group(1))
    except ValueError:
        # The non-greedy match can cut the object short; treat an
        # undecodable config like a missing one instead of crashing.
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args['ttsurl']
        timestamp = args['timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None:
            self._downloader.report_warning('Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']
        caption_kind = original_lang_node.attrib.get('kind', '')

        # One URL per <target> language, all derived from the original track
        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': caption_kind,
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
720
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. the second group of cls._VALID_URL.

    Raises ExtractorError when the URL does not match the pattern.
    """
    match = re.compile(cls._VALID_URL, re.VERBOSE).match(url)
    if match:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
728
1d043b93
JMF
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag -> URL.

    Every non-empty manifest line that does not start with '#' is taken
    as a format URL; its itag is read from the 'itag/<digits>/' part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Blank lines and '#'-prefixed lines carry no format URL
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
743
1fb07d10
JG
def _extract_annotations(self, video_id):
    """Download and return the annotations document of a video."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
1fb07d10 747
da276600
PH
def _parse_dash_manifest(
        self, video_id, dash_manifest_url, player_url, age_gate):
    """Download a DASH manifest and return the formats it describes.

    An encrypted '/s/<sig>' component of the manifest URL is replaced by
    its decrypted '/signature/<sig>' form before downloading. Each
    Representation element that has a BaseURL yields one format dict,
    merged on top of the static entry of self._formats with the same id.
    Representations sharing a format id update the earlier entry.
    """
    def _replace_encrypted_sig(sig_match):
        decrypted = self._decrypt_signature(
            sig_match.group(1), video_id, player_url, age_gate)
        return '/signature/%s' % decrypted

    dash_manifest_url = re.sub(
        r'/s/([\w\.]+)', _replace_encrypted_sig, dash_manifest_url)
    dash_doc = self._download_xml(
        dash_manifest_url, video_id,
        note='Downloading DASH manifest',
        errnote='Could not download DASH manifest')

    formats = []
    for representation in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
        base_url_el = representation.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
        if base_url_el is None:
            continue
        attrs = representation.attrib
        format_id = attrs['id']
        media_url = base_url_el.text
        filesize = int_or_none(base_url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
        dash_info = {
            'format_id': format_id,
            'url': media_url,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
            'tbr': int_or_none(attrs.get('bandwidth'), 1000),
            'asr': int_or_none(attrs.get('audioSamplingRate')),
            'filesize': filesize,
            'fps': int_or_none(attrs.get('frameRate')),
        }
        for known in formats:
            if known['format_id'] == format_id:
                # Same id seen before: refresh that entry in place
                known.update(dash_info)
                break
        else:
            merged = self._formats.get(format_id, {}).copy()
            merged.update(dash_info)
            formats.append(merged)
    return formats
789
def _real_extract(self, url):
    """Extract the metadata and the available formats of a single video.

    Steps: normalise the URL and download the watch page, obtain the
    video info (from the embedded player config, from get_video_info, or
    through the embed page for age-gated videos), scrape the metadata
    from the watch page, then build the format list (rtmp, encoded
    stream maps, HLS manifest and, optionally, the DASH manifest).

    Returns the info dict, or None when the 'listsubtitles' option is set
    (the subtitles are only listed in that case).
    Raises ExtractorError when the video info is unusable.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes escaping the URL inside the page
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        try:
            # Try looking directly into the video webpage
            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find ytplayer.config')  # caught below
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Convert to the same format returned by compat_parse_qs
            video_info = dict((k, [v]) for k, v in args.items())
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError('No stream_map present')  # caught below
        except ValueError:
            # We fallback to the get_video_info pages (used by the embed page)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Turn '/', ',' and '-' into spaces and collapse the whitespace
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # categories (only the first link of the "Category" list is kept)
    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace each redirect link by the value of its title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        # Read the 'watch-<count_name>' counter; thousands separators are
        # stripped. Returns None when the counter is not on the page.
        count = self._search_regex(
            r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
            video_webpage, count_name, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        # Turn an itag -> URL map into format dicts, enriched with the
        # static data of self._formats when the itag is known.
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # For age-gated videos the search above may have found
                # nothing; json.loads(None) would raise TypeError and the
                # fallback below would never run.
                if jsplayer_url_json:
                    player_url = json.loads(jsplayer_url_json)
                else:
                    player_url = None
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd:
            dash_manifest_url = dash_mpd[0]
            try:
                dash_formats = self._parse_dash_manifest(
                    video_id, dash_manifest_url, player_url, age_gate)
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            else:
                # Hide the formats we found through non-DASH
                dash_keys = set(df['format_id'] for df in dash_formats)
                for f in formats:
                    if f['format_id'] in dash_keys:
                        f['format_id'] = 'nondash-%s' % f['format_id']
                        f['preference'] = f.get('preference', 0) - 10000
                formats.extend(dash_formats)

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
        for f in formats:
            if f.get('vcodec') != 'none':
                f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
c5e8d7af 1129
5f6a1245 1130
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes ('RD' ids)."""

    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Was misspelled 'playlist_mincout', so the check never applied
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap each video id in a url result handled by the 'Youtube' IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix by scraping the watch page of its seed video."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # The first class name that yields an element provides the title
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for a playlist URL or bare id.

        When the URL also names a video and the 'noplaylist' option is
        set, only that video is returned. Raises ExtractorError for an
        invalid URL or a missing/private playlist.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
c5e8d7af
PH
1316
1317
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos listed on a YouTube channel page."""

    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order, without repeats."""
        seen = set()
        unique_ids = []
        for found in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if found in seen:
                continue
            seen.add(found)
            unique_ids.append(found)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result holding every video of the channel."""
        channel_id = self._match_id(url)

        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            return self.playlist_result(
                [self.url_result(vid, 'Youtube', video_id=vid)
                 for vid in self.extract_videos_from_page(channel_page)],
                channel_id)

        def _entries():
            # Lazily walk the channel: yield the videos of the current
            # chunk, then follow its "load more" link while there is one.
            current_html = channel_page
            widget_html = channel_page
            next_page = 2
            while True:
                for vid in self.extract_videos_from_page(current_html):
                    yield self.url_result(
                        vid, 'Youtube', video_id=vid)

                more_match = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    widget_html)
                if not more_match:
                    return

                more = self._download_json(
                    'https://youtube.com/%s' % more_match.group('more'), channel_id,
                    'Downloading page #%s' % next_page,
                    transform_source=uppercase_escape)
                current_html = more['content_html']
                widget_html = more['load_more_widget_html']
                next_page += 1

        return self.playlist_result(_entries(), channel_id)
c5e8d7af
PH
1382
1383
class YoutubeUserIE(InfoExtractor):
    """Extractor for the uploads of a YouTube user (URL or 'ytuser:' keyword)."""

    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other '*IE' class of this module matches url."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match it as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist of the user's uploads."""
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            # The API start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            # The last requested index is start + size - 1 (the original
            # note printed one past it, e.g. "from 1 to 51")
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE - 1))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1452
b05654f0
PH
1453
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed ('ytsearch' key)."""

    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Return a playlist with at most n results for the query."""
        page_size = 50
        collected = []
        wanted = n
        offset = 0  # number of results requested so far

        while offset < wanted:
            api_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                offset + 1)
            raw_json = self._download_webpage(
                api_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (offset // page_size + 1),
                errnote='Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected.extend([item['id'] for item in api_response['items']])

            # Never ask for more than the API reports as available
            wanted = min(n, api_response['totalItems'])
            offset += page_size

        if len(collected) > n:
            collected = collected[:n]
        results = []
        for found_id in collected:
            results.append(
                self.url_result(found_id, 'Youtube', video_id=found_id))
        return self.playlist_result(results, query)
c9ae7b95 1496
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""

    IE_NAME = '%s:date' % YoutubeSearchIE.IE_NAME
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1502
c9ae7b95
PH
1503
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results?search_query=... page into a playlist."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _parse_result_item(self, item_html):
        """Build a 'url' entry from the HTML of one result heading."""
        # The title is optional (fatal=False); the link is mandatory.
        item_title = self._html_search_regex(
            [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
        href = self._html_search_regex(
            r'(?s)href="([^"]+)"', item_html, 'item URL')
        return {
            '_type': 'url',
            'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
            'title': item_title,
        }

    def _real_extract(self, url):
        """Download the search page and return its results as a playlist.

        The playlist title is the decoded search query taken from the URL.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        # Each result is represented by one "yt-lockup-title" heading.
        item_htmls = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)

        return {
            '_type': 'playlist',
            'entries': [
                self._parse_result_item(item_html)
                for item_html in item_htmls],
            'title': query,
        }
1545
1546
class YoutubeShowIE(InfoExtractor):
    """Expose a show page as a playlist of its season playlists."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one YoutubePlaylist entry per season link."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            '%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': entries,
        }
04cc9617
JMF
1581
1582
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for one feed page; its single %s is the paging value."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect every video of the feed, following the paging links.

        Returns a playlist result titled _PLAYLIST_TITLE.

        Raises ExtractorError when a response page carries neither
        'feed_html' nor 'content_html'.
        """
        feed_entries = []
        paging = 0
        for page_num in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            if feed_html is None:
                # Fail with a readable message instead of the TypeError the
                # regular expression calls below would raise on None.
                raise ExtractorError(
                    'No feed HTML found in the %s feed response' % self._FEED_NAME)
            # Without a dedicated widget, look for the paging link in the
            # feed markup itself.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                # No "load more" link: this was the last page.
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1630
5f6a1245 1631
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
c626a3d9 1637
5f6a1245 1638
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'watch_later' feed."""

    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
c626a3d9 1645
5f6a1245 1646
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'history' feed."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: a plain literal containing "\." is an invalid escape
    # sequence. The resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
f459d170 1653
5f6a1245 1654
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a url result that hands the playlist id to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The id is taken from the first "list=" parameter on the page.
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
15870e90
PH
1665
1666
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the videos listed on the subscriptions feed page."""

    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist with all videos of the subscriptions feed.

        Follows the "load more" links until a page offers none.
        """
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        video_ids = []
        # The first page is both the content and the place to look for the
        # "load more" link; later pages deliver them separately.
        html, widget_html = first_page, first_page
        page_num = 1

        while True:
            video_ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', html)))

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if more_match is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            html = more['content_html']
            widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1703
1704
15870e90
PH
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs without a video id and explain why."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
772fd5cc
PH
1748
1749
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose v= value is 1 to 10 characters long."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        truncated_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (
            truncated_id, url)
        raise ExtractorError(message, expected=True)