jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/youtube.py
release 2014.08.21
[yt-dlp.git] / youtube_dl / extractor / youtube.py
CommitLineData
c5e8d7af 1# coding: utf-8
c5e8d7af 2
edf3e38e 3import errno
e0df6211 4import io
0ca96d48 5import itertools
c5e8d7af 6import json
c4417ddb 7import os.path
c5e8d7af 8import re
e0df6211 9import traceback
c5e8d7af 10
b05654f0 11from .common import InfoExtractor, SearchInfoExtractor
54d39d8b 12from .subtitles import SubtitlesInfoExtractor
2b25cb5d 13from ..jsinterp import JSInterpreter
54256267 14from ..swfinterp import SWFInterpreter
c5e8d7af 15from ..utils import (
edf3e38e 16 compat_chr,
c5e8d7af 17 compat_parse_qs,
c5e8d7af
PH
18 compat_urllib_parse,
19 compat_urllib_request,
7c61bd36 20 compat_urlparse,
c5e8d7af
PH
21 compat_str,
22
23 clean_html,
c38b1e77 24 get_cachedir,
c5e8d7af 25 get_element_by_id,
652cdaa2 26 get_element_by_attribute,
c5e8d7af 27 ExtractorError,
dd27fd17 28 int_or_none,
b7ab0590 29 PagedList,
c5e8d7af
PH
30 unescapeHTML,
31 unified_strdate,
04cc9617 32 orderedSet,
edf3e38e 33 write_json_file,
81c2f20b 34 uppercase_escape,
c5e8d7af
PH
35)
36
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Bundles the session preparation steps run from _real_initialize:
    requesting the language URL, logging in with the configured
    credentials and posting the age confirmation form.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request _LANG_URL (non-fatal) and return True if a page came back."""
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the credentials returned by _get_login_info().

        Returns True when the login request succeeded.  Returns False when
        no username is configured, when either download fails, or when the
        response still contains the login form (reported as a bad
        username or password).

        Raises ExtractorError if no username is available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False like every other failure path,
            # instead of falling out with an implicit None.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL and return True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Run language setup, login and age confirmation in that order.

        Does nothing without a downloader, and stops at the first step
        that reports failure.
        """
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
c5e8d7af 128
8377574c 129
de7f3446 130class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
0f818663 131 IE_DESC = u'YouTube.com'
cb7dfeea 132 _VALID_URL = r"""(?x)^
c5e8d7af 133 (
83aa5293 134 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
cb7dfeea 135 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
484aaeb2 136 (?:www\.)?deturl\.com/www\.youtube\.com/|
e70dc1d1 137 (?:www\.)?pwnyoutube\.com/|
f7000f3a 138 (?:www\.)?yourepeat\.com/|
e69ae5b9
JMF
139 tube\.majestyc\.net/|
140 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
c5e8d7af
PH
141 (?:.*?\#/)? # handle anchor (#/) redirect urls
142 (?: # the various things that can precede the ID:
143 (?:(?:v|embed|e)/) # v/ or embed/ or e/
144 |(?: # or the v= param in all its forms
f7000f3a 145 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
c5e8d7af
PH
146 (?:\?|\#!?) # the params delimiter ? or # or #!
147 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
148 v=
149 )
f4b05232
JMF
150 ))
151 |youtu\.be/ # just youtu.be/xxxx
b9c76aa1 152 |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
f4b05232 153 )
c5e8d7af 154 )? # all until now is optional -> you can pass the naked ID
8963d9c2 155 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
c5e8d7af
PH
156 (?(1).+)? # if we found the ID, everything can follow
157 $"""
c5e8d7af 158 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2c62dc26
PH
159 _formats = {
160 '5': {'ext': 'flv', 'width': 400, 'height': 240},
161 '6': {'ext': 'flv', 'width': 450, 'height': 270},
162 '13': {'ext': '3gp'},
163 '17': {'ext': '3gp', 'width': 176, 'height': 144},
164 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
165 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
166 '34': {'ext': 'flv', 'width': 640, 'height': 360},
167 '35': {'ext': 'flv', 'width': 854, 'height': 480},
168 '36': {'ext': '3gp', 'width': 320, 'height': 240},
169 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
170 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
171 '43': {'ext': 'webm', 'width': 640, 'height': 360},
172 '44': {'ext': 'webm', 'width': 854, 'height': 480},
173 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
174 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
175
1d043b93 176
86fe61c8 177 # 3d videos
43b81eb9
PH
178 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
179 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
180 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
181 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
182 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
183 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
184 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
836a086c 185
96fb5605 186 # Apple HTTP Live Streaming
43b81eb9
PH
187 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
188 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
189 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
190 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
191 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
192 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
193 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
2c62dc26
PH
194
195 # DASH mp4 video
43b81eb9
PH
196 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
197 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
198 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
199 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
200 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
201 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
202 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
203 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
836a086c 204
f6f1fc92 205 # Dash mp4 audio
2c62dc26
PH
206 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
207 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
208 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
836a086c
AZ
209
210 # Dash webm
e75cafe9
A
211 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
212 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
213 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
214 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
215 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
216 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
217 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
218 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
219 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
220 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
221 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
222 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
223 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
3c80377b 224 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
18061bba 225 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
2c62dc26
PH
226
227 # Dash webm audio
e75cafe9
A
228 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
229 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
ce6b9a2d
PH
230
231 # RTMP (unnamed)
232 '_rtmp': {'protocol': 'rtmp'},
c5e8d7af 233 }
836a086c 234
c5e8d7af 235 IE_NAME = u'youtube'
2eb88d95
PH
236 _TESTS = [
237 {
0e853ca4
PH
238 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
239 u"file": u"BaW_jenozKc.mp4",
240 u"info_dict": {
241 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
242 u"uploader": u"Philipp Hagemeister",
243 u"uploader_id": u"phihag",
244 u"upload_date": u"20121002",
ad3bc6ac
PH
245 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
246 u"categories": [u'Science & Technology'],
2eb88d95 247 }
0e853ca4 248 },
0e853ca4
PH
249 {
250 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
251 u"file": u"UxxajLWwzqY.mp4",
252 u"note": u"Test generic use_cipher_signature video (#897)",
253 u"info_dict": {
254 u"upload_date": u"20120506",
255 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
ba60a3eb 256 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
45ed795c 257 u"uploader": u"Icona Pop",
0e853ca4 258 u"uploader_id": u"IconaPop"
2eb88d95 259 }
c108eb73
JMF
260 },
261 {
262 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
263 u"file": u"07FYdnEawAQ.mp4",
264 u"note": u"Test VEVO video with age protection (#956)",
265 u"info_dict": {
266 u"upload_date": u"20130703",
267 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
268 u"description": u"md5:64249768eec3bc4276236606ea996373",
269 u"uploader": u"justintimberlakeVEVO",
270 u"uploader_id": u"justintimberlakeVEVO"
271 }
272 },
fccd3771 273 {
83aa5293 274 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
fccd3771
PH
275 u"file": u"yZIXLfi8CZQ.mp4",
276 u"note": u"Embed-only video (#1746)",
277 u"info_dict": {
278 u"upload_date": u"20120608",
279 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
280 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
281 u"uploader": u"SET India",
282 u"uploader_id": u"setindia"
283 }
284 },
dd27fd17
PH
285 {
286 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
287 u"file": u"a9LDPn-MO4I.m4a",
288 u"note": u"256k DASH audio (format 141) via DASH manifest",
dd27fd17
PH
289 u"info_dict": {
290 u"upload_date": "20121002",
291 u"uploader_id": "8KVIDEO",
292 u"description": "No description available.",
293 u"uploader": "8KVIDEO",
294 u"title": "UHDTV TEST 8K VIDEO.mp4"
4919603f
PH
295 },
296 u"params": {
297 u"youtube_include_dash_manifest": True,
298 u"format": "141",
299 },
dd27fd17 300 },
3489b7d2
JMF
301 # DASH manifest with encrypted signature
302 {
303 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
304 u'info_dict': {
305 u'id': u'IB3lcPjvWLA',
306 u'ext': u'm4a',
307 u'title': u'Afrojack - The Spark ft. Spree Wilson',
e00c9cf5 308 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
3489b7d2
JMF
309 u'uploader': u'AfrojackVEVO',
310 u'uploader_id': u'AfrojackVEVO',
311 u'upload_date': u'20131011',
312 },
313 u"params": {
314 u'youtube_include_dash_manifest': True,
315 u'format': '141',
316 },
317 },
2eb88d95
PH
318 ]
319
c5e8d7af
PH
320
@classmethod
def suitable(cls, url):
    """Return True if this extractor should handle *url*.

    A URL that YoutubePlaylistIE accepts is always rejected here;
    otherwise the URL has to match _VALID_URL.
    """
    return (
        not YoutubePlaylistIE.suitable(url)
        and re.match(cls._VALID_URL, url) is not None)
c5e8d7af 326
e0df6211
PH
def __init__(self, *args, **kwargs):
    """Initialise the base classes and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature with the extracted signature functions
    self._player_cache = dict()
e0df6211 330
c5e8d7af
PH
def report_video_info_webpage_download(self, video_id):
    """Print a message saying the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
334
c5e8d7af
PH
def report_information_extraction(self, video_id):
    """Print a message saying video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
338
def report_unavailable_format(self, video_id, format):
    """Print a message saying the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
342
def report_rtmp_download(self):
    """Print a message saying the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
346
60064c53
PH
def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated signature parts, joined by dots."""
    part_lengths = [
        compat_str(len(piece)) for piece in example_sig.split('.')]
    return u'.'.join(part_lengths)
350
def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player type (file extension) and player id are taken from
    *player_url*.  If a cache directory is configured, a cached index list
    for this player and signature shape is tried first.  Otherwise the
    player is downloaded and parsed with _parse_sig_js or _parse_sig_swf,
    and the resulting character permutation is written back to the cache.

    video_id is only passed on to the download helpers.
    example_sig is used for its shape (see _signature_cache_id) and length.

    Raises ExtractorError if the player URL cannot be identified or the
    player type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # Sanity check: func_id becomes a file name and must not contain a path
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        except IOError:
            pass  # No cache available
        except ValueError:
            # The file exists but does not hold valid JSON (json.load and
            # decoding errors are both ValueError).  Fall through and
            # rebuild it instead of failing on every run.
            self._downloader.report_warning(
                u'Ignoring unreadable signature cache %r' % cache_fn)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Explicit raise: an assert would vanish under -O and leave
        # `res` unbound further down.
        raise ExtractorError('Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string of distinct characters so the
            # output reveals which input index lands at each position.
            test_string = u''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best effort: report the problem and carry on
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
411
def _print_sig_code(self, func, example_sig):
    """Print Python source equivalent to the signature function *func*.

    *func* is run on a string of distinct characters as long as
    *example_sig*; the resulting index order is rendered as a sum of
    ``s[...]`` index and slice expressions and sent to to_screen.
    """
    def gen_sig_code(indices):
        def _slice_expr(first, last, stride):
            stop = last + stride
            head = str(first) if first != 0 else u''
            # A negative stop means the run reaches the start of the string
            tail = u':' if stop < 0 else (u':%d' % stop)
            stride_part = (u':%d' % stride) if stride != 1 else u''
            return u's[%s%s%s]' % (head, tail, stride_part)

        run_step = None
        run_start = '(Never used)'  # Quelch pyflakes warnings - run_start
                                    # will be set as soon as run_step is set
        for cur, before in zip(indices[1:], indices[:-1]):
            delta = cur - before
            if run_step is not None:
                if delta == run_step:
                    # Still inside the current run
                    continue
                yield _slice_expr(run_start, before, run_step)
                run_step = None
                continue
            if delta in [-1, 1]:
                # Two neighbouring indices open a new run
                run_step = delta
                run_start = before
                continue
            else:
                yield u's[%d]' % before
        if run_step is None:
            yield u's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, run_step)

    probe = u''.join(map(compat_chr, range(len(example_sig))))
    permuted = func(probe)
    index_order = [ord(ch) for ch in permuted]
    expr_code = u' + '.join(gen_sig_code(index_order))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % (part_lengths)
    # NOTE(review): the leading whitespace inside the second literal is
    # reproduced as seen; confirm it against the upstream file.
    code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            u' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
edf3e38e 450
e0df6211
PH
def _parse_sig_js(self, jscode):
    """Return a callable applying the JS player's signature function.

    The function name is the identifier that follows ``signature=`` in
    *jscode*; it is extracted with JSInterpreter and called with the
    input string wrapped in a one-element list.
    """
    sig_func_name = self._search_regex(
        r'signature=([$a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')
    js_function = JSInterpreter(jscode).extract_function(sig_func_name)

    def decipher(s):
        return js_function([s])
    return decipher
459
def _parse_sig_swf(self, file_contents):
    """Return a callable applying the SWF player's ``decipher`` function.

    The function is looked up on the ``SignatureDecipher`` class with
    SWFInterpreter and called with the input string wrapped in a
    one-element list.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(u'SignatureDecipher')
    swf_function = interpreter.extract_function(decipher_class, u'decipher')

    def decipher(s):
        return swf_function([s])
    return decipher
466
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    Signature functions are kept in self._player_cache, keyed by the
    player URL and the shape of the signature.  With the
    'youtube_print_sig_code' option set, the function is also printed.
    The age_gate argument is not used by this method.

    Raises ExtractorError if player_url is None or if anything fails
    while extracting or applying the function.
    """
    if player_url is None:
        raise ExtractorError(u'Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are fetched over https
    if player_url.startswith(u'//'):
        player_url = u'https:' + player_url
    try:
        sig_key = (player_url, self._signature_cache_id(s))
        known_players = self._player_cache
        if sig_key not in known_players:
            known_players[sig_key] = self._extract_signature_function(
                video_id, player_url, s
            )
        decipher = known_players[sig_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            u'Signature extraction failed: ' + traceback.format_exc(),
            cause=e)
e0df6211 490
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to subtitle URLs.

    The track list is downloaded from the timedtext listing for
    *video_id*; *webpage* is not used.  An empty dict is returned, with
    a warning, when the list cannot be downloaded or has no tracks.
    """
    try:
        track_listing = self._download_webpage(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    # Each match is a (track name, language code) pair
    for track_name, lang in re.findall(
            r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', track_listing):
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        sub_lang_list[lang] = u'https://www.youtube.com/api/timedtext?' + query
    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
516
055e6f36 517 def _get_available_automatic_caption(self, video_id, webpage):
de7f3446
JMF
518 """We need the webpage for getting the captions url, pass it as an
519 argument to speed up the process."""
ca715127 520 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
de7f3446
JMF
521 self.to_screen(u'%s: Looking for automatic captions' % video_id)
522 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
055e6f36 523 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
de7f3446
JMF
524 if mobj is None:
525 self._downloader.report_warning(err_msg)
526 return {}
527 player_config = json.loads(mobj.group(1))
528 try:
529 args = player_config[u'args']
530 caption_url = args[u'ttsurl']
531 timestamp = args[u'timestamp']
055e6f36
JMF
532 # We get the available subtitles
533 list_params = compat_urllib_parse.urlencode({
534 'type': 'list',
535 'tlangs': 1,
536 'asrs': 1,
de7f3446 537 })
055e6f36 538 list_url = caption_url + '&' + list_params
e26f8712 539 caption_list = self._download_xml(list_url, video_id)
e3dc22ca 540 original_lang_node = caption_list.find('track')
f6a54188 541 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
e3dc22ca
JMF
542 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
543 return {}
544 original_lang = original_lang_node.attrib['lang_code']
055e6f36
JMF
545
546 sub_lang_list = {}
547 for lang_node in caption_list.findall('target'):
548 sub_lang = lang_node.attrib['lang_code']
549 params = compat_urllib_parse.urlencode({
550 'lang': original_lang,
551 'tlang': sub_lang,
552 'fmt': sub_format,
553 'ts': timestamp,
554 'kind': 'asr',
555 })
556 sub_lang_list[sub_lang] = caption_url + '&' + params
557 return sub_lang_list
de7f3446
JMF
558 # An extractor error can be raise by the download process if there are
559 # no automatic captions but there are subtitles
560 except (KeyError, ExtractorError):
561 self._downloader.report_warning(err_msg)
562 return {}
563
97665381
PH
@classmethod
def extract_id(cls, url):
    """Return the video ID captured by group 2 of _VALID_URL.

    Raises ExtractorError if *url* does not match _VALID_URL.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
571
1d043b93
JMF
572 def _extract_from_m3u8(self, manifest_url, video_id):
573 url_map = {}
574 def _get_urls(_manifest):
575 lines = _manifest.split('\n')
576 urls = filter(lambda l: l and not l.startswith('#'),
577 lines)
578 return urls
579 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
580 formats_urls = _get_urls(manifest)
581 for format_url in formats_urls:
890f62e8 582 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1d043b93
JMF
583 url_map[itag] = format_url
584 return url_map
585
1fb07d10
JG
586 def _extract_annotations(self, video_id):
587 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
588 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
589
c5e8d7af 590 def _real_extract(self, url):
7e8c0af0
PH
591 proto = (
592 u'http' if self._downloader.params.get('prefer_insecure', False)
593 else u'https')
594
c5e8d7af
PH
595 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
596 mobj = re.search(self._NEXT_URL_RE, url)
597 if mobj:
7e8c0af0 598 url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
97665381 599 video_id = self.extract_id(url)
c5e8d7af
PH
600
601 # Get video webpage
7e8c0af0 602 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
336c3a69 603 video_webpage = self._download_webpage(url, video_id)
c5e8d7af
PH
604
605 # Attempt to extract SWF player URL
e0df6211 606 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
c5e8d7af
PH
607 if mobj is not None:
608 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
609 else:
610 player_url = None
611
612 # Get video info
613 self.report_video_info_webpage_download(video_id)
c108eb73
JMF
614 if re.search(r'player-age-gate-content">', video_webpage) is not None:
615 self.report_age_confirmation()
616 age_gate = True
617 # We simulate the access to the video from www.youtube.com/v/{video_id}
618 # this can be viewed without login into Youtube
2c57c7fa
JMF
619 data = compat_urllib_parse.urlencode({
620 'video_id': video_id,
621 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
c084c934
JMF
622 'sts': self._search_regex(
623 r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
2c57c7fa 624 })
7e8c0af0 625 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
c5e8d7af
PH
626 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 note=False,
628 errnote='unable to download video info webpage')
629 video_info = compat_parse_qs(video_info_webpage)
c108eb73
JMF
630 else:
631 age_gate = False
632 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
7e8c0af0 633 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
c108eb73
JMF
634 % (video_id, el_type))
635 video_info_webpage = self._download_webpage(video_info_url, video_id,
636 note=False,
637 errnote='unable to download video info webpage')
638 video_info = compat_parse_qs(video_info_webpage)
639 if 'token' in video_info:
640 break
c5e8d7af
PH
641 if 'token' not in video_info:
642 if 'reason' in video_info:
d11271dd
PH
643 raise ExtractorError(
644 u'YouTube said: %s' % video_info['reason'][0],
645 expected=True, video_id=video_id)
c5e8d7af 646 else:
d11271dd
PH
647 raise ExtractorError(
648 u'"token" parameter not in video info for unknown reason',
649 video_id=video_id)
c5e8d7af 650
1d699755
PH
651 if 'view_count' in video_info:
652 view_count = int(video_info['view_count'][0])
653 else:
654 view_count = None
655
c5e8d7af
PH
656 # Check for "rental" videos
657 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
658 raise ExtractorError(u'"rental" videos not supported')
659
660 # Start extracting information
661 self.report_information_extraction(video_id)
662
663 # uploader
664 if 'author' not in video_info:
665 raise ExtractorError(u'Unable to extract uploader name')
666 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
667
668 # uploader_id
669 video_uploader_id = None
670 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
671 if mobj is not None:
672 video_uploader_id = mobj.group(1)
673 else:
674 self._downloader.report_warning(u'unable to extract uploader nickname')
675
676 # title
a8c6b241 677 if 'title' in video_info:
aa92f063 678 video_title = video_info['title'][0]
a8c6b241
PH
679 else:
680 self._downloader.report_warning(u'Unable to extract video title')
681 video_title = u'_'
c5e8d7af
PH
682
683 # thumbnail image
7763b04e
JMF
684 # We try first to get a high quality image:
685 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
686 video_webpage, re.DOTALL)
687 if m_thumb is not None:
688 video_thumbnail = m_thumb.group(1)
689 elif 'thumbnail_url' not in video_info:
c5e8d7af 690 self._downloader.report_warning(u'unable to extract video thumbnail')
f490e77e 691 video_thumbnail = None
c5e8d7af
PH
692 else: # don't panic if we can't find it
693 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
694
695 # upload date
696 upload_date = None
ad3bc6ac 697 mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
beee53de
PH
698 if mobj is None:
699 mobj = re.search(
263bd4ec 700 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
beee53de 701 video_webpage)
c5e8d7af
PH
702 if mobj is not None:
703 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
704 upload_date = unified_strdate(upload_date)
705
ec8deefc
DG
706 m_cat_container = get_element_by_id("eow-category", video_webpage)
707 if m_cat_container:
ad3bc6ac 708 category = self._html_search_regex(
01ed5c9b 709 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
ad3bc6ac
PH
710 default=None)
711 video_categories = None if category is None else [category]
712 else:
713 video_categories = None
ec8deefc 714
c5e8d7af
PH
715 # description
716 video_description = get_element_by_id("eow-description", video_webpage)
717 if video_description:
27dcce19
PH
718 video_description = re.sub(r'''(?x)
719 <a\s+
720 (?:[a-zA-Z-]+="[^"]+"\s+)*?
721 title="([^"]+)"\s+
722 (?:[a-zA-Z-]+="[^"]+"\s+)*?
723 class="yt-uix-redirect-link"\s*>
724 [^<]+
725 </a>
726 ''', r'\1', video_description)
c5e8d7af
PH
727 video_description = clean_html(video_description)
728 else:
729 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
730 if fd_mobj:
731 video_description = unescapeHTML(fd_mobj.group(1))
732 else:
733 video_description = u''
734
336c3a69 735 def _extract_count(klass):
46374a56
PH
736 count = self._search_regex(
737 r'class="%s">([\d,]+)</span>' % re.escape(klass),
738 video_webpage, klass, default=None)
336c3a69
JMF
739 if count is not None:
740 return int(count.replace(',', ''))
741 return None
742 like_count = _extract_count(u'likes-count')
743 dislike_count = _extract_count(u'dislikes-count')
744
c5e8d7af 745 # subtitles
d82134c3 746 video_subtitles = self.extract_subtitles(video_id, video_webpage)
c5e8d7af 747
c5e8d7af 748 if self._downloader.params.get('listsubtitles', False):
d665f8d3 749 self._list_available_subtitles(video_id, video_webpage)
c5e8d7af
PH
750 return
751
752 if 'length_seconds' not in video_info:
753 self._downloader.report_warning(u'unable to extract video duration')
b466b702 754 video_duration = None
c5e8d7af 755 else:
b466b702 756 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
c5e8d7af 757
1fb07d10
JG
758 # annotations
759 video_annotations = None
760 if self._downloader.params.get('writeannotations', False):
761 video_annotations = self._extract_annotations(video_id)
762
c5e8d7af 763 # Decide which formats to download
c5e8d7af 764 try:
ae7ed920 765 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
50be92c1
PH
766 if not mobj:
767 raise ValueError('Could not find vevo ID')
ae7ed920
PH
768 json_code = uppercase_escape(mobj.group(1))
769 ytplayer_config = json.loads(json_code)
3489b7d2 770 args = ytplayer_config['args']
7ce7e394
JMF
771 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
772 # this signatures are encrypted
44d46655 773 if 'url_encoded_fmt_stream_map' not in args:
f10503db 774 raise ValueError(u'No stream_map present') # caught below
00fe14fc
JMF
775 re_signature = re.compile(r'[&,]s=')
776 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
7ce7e394
JMF
777 if m_s is not None:
778 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
c5e8d7af 779 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
00fe14fc 780 m_s = re_signature.search(args.get('adaptive_fmts', u''))
b7a68384 781 if m_s is not None:
00fe14fc
JMF
782 if 'adaptive_fmts' in video_info:
783 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
37b6d5f6 784 else:
00fe14fc 785 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
c5e8d7af
PH
786 except ValueError:
787 pass
788
dd27fd17
PH
789 def _map_to_format_list(urlmap):
790 formats = []
791 for itag, video_real_url in urlmap.items():
792 dct = {
793 'format_id': itag,
794 'url': video_real_url,
795 'player_url': player_url,
796 }
0b65e5d4
PH
797 if itag in self._formats:
798 dct.update(self._formats[itag])
dd27fd17
PH
799 formats.append(dct)
800 return formats
801
c5e8d7af
PH
802 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
803 self.report_rtmp_download()
dd27fd17
PH
804 formats = [{
805 'format_id': '_rtmp',
806 'protocol': 'rtmp',
807 'url': video_info['conn'][0],
808 'player_url': player_url,
809 }]
00fe14fc
JMF
810 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
811 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
812 if 'rtmpe%3Dyes' in encoded_url_map:
a7055eb9 813 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
c5e8d7af 814 url_map = {}
00fe14fc 815 for url_data_str in encoded_url_map.split(','):
c5e8d7af 816 url_data = compat_parse_qs(url_data_str)
201e9eaa
PH
817 if 'itag' not in url_data or 'url' not in url_data:
818 continue
819 format_id = url_data['itag'][0]
820 url = url_data['url'][0]
821
822 if 'sig' in url_data:
823 url += '&signature=' + url_data['sig'][0]
824 elif 's' in url_data:
825 encrypted_sig = url_data['s'][0]
826
827 if not age_gate:
828 jsplayer_url_json = self._search_regex(
829 r'"assets":.+?"js":\s*("[^"]+")',
830 video_webpage, u'JS player URL')
831 player_url = json.loads(jsplayer_url_json)
832 if player_url is None:
833 player_url_json = self._search_regex(
834 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
835 video_webpage, u'age gate player URL')
836 player_url = json.loads(player_url_json)
837
838 if self._downloader.params.get('verbose'):
cf010131 839 if player_url is None:
201e9eaa
PH
840 player_version = 'unknown'
841 player_desc = 'unknown'
842 else:
843 if player_url.endswith('swf'):
844 player_version = self._search_regex(
845 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
846 u'flash player', fatal=False)
847 player_desc = 'flash player %s' % player_version
cf010131 848 else:
201e9eaa
PH
849 player_version = self._search_regex(
850 r'html5player-([^/]+?)(?:/html5player)?\.js',
851 player_url,
852 'html5 player', fatal=False)
853 player_desc = u'html5 player %s' % player_version
854
60064c53 855 parts_sizes = self._signature_cache_id(encrypted_sig)
98eb1c3f
PH
856 self.to_screen(u'{%s} signature length %s, %s' %
857 (format_id, parts_sizes, player_desc))
201e9eaa
PH
858
859 signature = self._decrypt_signature(
860 encrypted_sig, video_id, player_url, age_gate)
861 url += '&signature=' + signature
862 if 'ratebypass' not in url:
863 url += '&ratebypass=yes'
864 url_map[format_id] = url
dd27fd17 865 formats = _map_to_format_list(url_map)
1d043b93
JMF
866 elif video_info.get('hlsvp'):
867 manifest_url = video_info['hlsvp'][0]
868 url_map = self._extract_from_m3u8(manifest_url, video_id)
dd27fd17 869 formats = _map_to_format_list(url_map)
c5e8d7af 870 else:
9abb3204 871 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
c5e8d7af 872
dd27fd17 873 # Look for the DASH manifest
d68f0cdb 874 if (self._downloader.params.get('youtube_include_dash_manifest', False)):
dd27fd17 875 try:
d68f0cdb 876 # The DASH manifest used needs to be the one from the original video_webpage.
877 # The one found in get_video_info seems to be using different signatures.
878 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
879 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
880 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
881 if age_gate:
3489b7d2 882 dash_manifest_url = video_info.get('dashmpd')[0]
d68f0cdb 883 else:
3489b7d2 884 dash_manifest_url = ytplayer_config['args']['dashmpd']
d68f0cdb 885 def decrypt_sig(mobj):
886 s = mobj.group(1)
887 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
888 return '/signature/%s' % dec_s
889 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dd27fd17 890 dash_doc = self._download_xml(
d68f0cdb 891 dash_manifest_url, video_id,
dd27fd17
PH
892 note=u'Downloading DASH manifest',
893 errnote=u'Could not download DASH manifest')
894 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
895 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
896 if url_el is None:
897 continue
898 format_id = r.attrib['id']
899 video_url = url_el.text
900 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
901 f = {
902 'format_id': format_id,
903 'url': video_url,
904 'width': int_or_none(r.attrib.get('width')),
905 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
906 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
907 'filesize': filesize,
908 }
909 try:
910 existing_format = next(
911 fo for fo in formats
912 if fo['format_id'] == format_id)
913 except StopIteration:
914 f.update(self._formats.get(format_id, {}))
915 formats.append(f)
916 else:
917 existing_format.update(f)
918
919 except (ExtractorError, KeyError) as e:
920 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
d80044c2 921
4bcc7bd1 922 self._sort_formats(formats)
4ea3be0a 923
924 return {
925 'id': video_id,
926 'uploader': video_uploader,
927 'uploader_id': video_uploader_id,
928 'upload_date': upload_date,
929 'title': video_title,
930 'thumbnail': video_thumbnail,
931 'description': video_description,
ec8deefc 932 'categories': video_categories,
4ea3be0a 933 'subtitles': video_subtitles,
934 'duration': video_duration,
935 'age_limit': 18 if age_gate else 0,
936 'annotations': video_annotations,
7e8c0af0 937 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
4ea3be0a 938 'view_count': view_count,
939 'like_count': like_count,
940 'dislike_count': dislike_count,
941 'formats': formats,
942 }
c5e8d7af 943
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including auto-generated mixes."""
    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id in a url_result handled by the 'Youtube' extractor."""
        return [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in ids]

    def _extract_mix(self, playlist_id):
        """Build a playlist result for a mix (playlist ids starting with 'RD')."""
        # A mix is generated from a single video: the playlist id is just
        # 'RD' followed by that video's 11-character id.
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')

        # Try the known title containers in order and keep the first hit
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))

        return self.playlist_result(self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video with --no-playlist)."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group(1) or url_match.group(2)

        # A watch URL can name both a video and the list it belongs to
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes need their own extraction routine
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)
        content_html = page
        more_widget_html = page

        # Bail out early when the playlist is missing or private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Collect the video ids page by page, following the "load more" link
        ids = []
        page_num = 1
        while True:
            # Drop duplicates and the link with index 0
            # (it is not the first video of the playlist)
            page_ids = [
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0']
            ids += orderedSet(page_ids)

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_match:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_match.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
c5e8d7af
PH
1054
1055
0a688bc0
JMF
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for channel top lists addressed as "yttoplist:{channel}:{list title}"."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    # Upper bound on how often the list page is fetched while waiting for it
    # to contain videos; prevents an endless download loop.
    _MAX_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Locate the list on the channel page and return its videos.

        Raises ExtractorError if the list page still contains no videos
        after _MAX_DOWNLOAD_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')

        # The link to the list carries the url-encoded title in its query
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # Sometimes the webpage doesn't contain the videos, so retry a
        # bounded number of times until we get them.
        for attempt in range(self._MAX_DOWNLOAD_ATTEMPTS):
            msg = u'Downloading Youtube mix'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                u'Unable to find any videos in top list "%s" after %d attempts'
                % (title, self._MAX_DOWNLOAD_ATTEMPTS))

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1086
1087
class YoutubeChannelIE(InfoExtractor):
    """Extractor that lists the videos of a YouTube channel."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, unique and in page order."""
        seen = set()
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Collect every video id of the channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # Auto-generated channels keep all videos on this single page;
            # their ajax pages are empty and cannot be used.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based channel_ajax pages until no "load more"
            # widget is offered any longer.
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)
                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [
            self.url_result(vid, 'Youtube', video_id=vid)
            for vid in video_ids]
        return self.playlist_result(entries, channel_id)
c5e8d7af
PH
1142
1143
class YoutubeUserIE(InfoExtractor):
    """Extractor for all uploads of a YouTube user, read from the GData feeds API.

    Accepts youtube.com/[user/]<name> URLs and the "ytuser:<name>" keyword.
    """
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module accepts *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match those too.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a lazily paged playlist with the uploads of the user in *url*.

        Raises ExtractorError for a URL that does not match or when the API
        answers with invalid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield url entries for the zero-based result page *pagenum*."""
            # The API start-index is one-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index requested on this page (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without entries means there is nothing (more) to yield
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1204
b05654f0
PH
1205
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed ("ytsearch" keyword)."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist with at most *n* results for *query*."""
        PAGE_SIZE = 50

        collected_ids = []
        limit = n
        pagenum = 0
        while PAGE_SIZE * pagenum < limit:
            # The API start-index is one-based
            start_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                start_index)
            pagenum += 1
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % pagenum,
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            collected_ids += [item['id'] for item in api_response['items']]

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])

        # The last page may have pushed us past the requested amount
        if len(collected_ids) > n:
            collected_ids = collected_ids[:n]
        videos = [
            self.url_result(vid, 'Youtube', video_id=vid)
            for vid in collected_ids]
        return self.playlist_result(videos, query)
75dff0ee 1247
c9ae7b95 1248
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor whose API query adds orderby=published."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
75dff0ee 1254
c9ae7b95
PH
1255
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results?search_query=... result pages."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _entry_from_part(self, part_code):
        """Turn the HTML of one result heading into a 'url' entry dict."""
        part_title = self._html_search_regex(
            [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
        part_url_snippet = self._html_search_regex(
            r'(?s)href="([^"]+)"', part_code, 'item URL')
        return {
            '_type': 'url',
            'url': compat_urlparse.urljoin(
                'https://www.youtube.com/', part_url_snippet),
            'title': part_title,
        }

    def _real_extract(self, url):
        """Return a playlist with one entry per result listed on the page."""
        mobj = re.match(self._VALID_URL, url)
        query = compat_urllib_parse.unquote_plus(mobj.group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        # Each result is introduced by a yt-lockup-title heading
        part_codes = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)

        return {
            '_type': 'playlist',
            'entries': [self._entry_from_part(part) for part in part_codes],
            'title': query,
        }
1290
1291
class YoutubeShowIE(InfoExtractor):
    """Extractor for show pages; yields one playlist URL per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one for each season playlist."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show has its own playlist
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        return [
            self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
            for path in season_paths]
04cc9617
JMF
1305
1306
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors reading https://www.youtube.com/feed_ajax

    Concrete subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; its remaining %s takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Download every page of the feed and return its videos as a playlist."""
        feed_entries = []
        paging = 0
        page_no = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_no)
            feed_html = info.get('feed_html') or info.get('content_html')

            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            # The "load more" link carries the paging value of the next page
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_no += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1351
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1357
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
c626a3d9 1363
43ba5456
JMF
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
c626a3d9 1370
f459d170
JMF
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor bound to the personal 'history' feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like every other _VALID_URL in this file: "\." in a
    # non-raw literal is an invalid escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
1377
c626a3d9
JMF
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that hands the favourites playlist over to YoutubePlaylist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Read the playlist id off the my_favorites page and delegate to it."""
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(favourites_url, 'Youtube Favourites videos')
        # The page links to the favourites playlist through a list= parameter
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
15870e90
PH
1388
1389
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs lacking a video id and explain the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying a quoting hint."""
        # An unquoted & in a shell cuts the URL off before the video id
        hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)