]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Prepare urllib references for 2/3 compatibility
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import gzip
5 import htmlentitydefs
6 import HTMLParser
7 import locale
8 import os
9 import re
10 import sys
11 import zlib
12 import email.utils
13 import json
14
15 try:
16 import cStringIO as StringIO
17 except ImportError:
18 import StringIO
19
# Default headers sent with HTTP requests. They mimic a desktop Firefox
# browser and advertise support for gzip and deflate compressed responses.
std_headers = dict((
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'en-us,en;q=0.5'),
))
27
# Text (unicode) string type: ``unicode`` on Python 2, ``str`` on Python 3.
compat_str = str
try:
    compat_str = unicode  # Python 2
except NameError:
    pass  # Python 3: ``str`` already is the text type
32
33 try:
34 import urllib.request as compat_urllib_request
35 except ImportError: # Python 2
36 import urllib2 as compat_urllib_request
37
38 try:
39 import urllib.error as compat_urllib_error
40 except ImportError: # Python 2
41 import urllib2 as compat_urllib_error
42
try:
    import urllib.parse as compat_urllib_parse
except ImportError:  # Python 2
    # On Python 2 the quoting/encoding helpers (urlencode, quote, quote_plus,
    # unquote, unquote_plus, ...) live in urllib; urllib2 only re-exports a
    # few of them and has no urlencode at all.
    import urllib as compat_urllib_parse
47
48 try:
49 import http.cookiejar as compat_cookiejar
50 except ImportError: # Python 2
51 import cookielib as compat_cookiejar
52
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or the reported encoding cannot be used to encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale errors, unknown or broken codecs), but
        # unlike a bare except it lets KeyboardInterrupt/SystemExit through
        pref = 'UTF-8'

    return pref
66
67
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading '&' and the trailing ';'.

    Named entities (e.g. 'amp') and numeric ones, decimal ('#233') or
    hexadecimal ('#xe9'), are converted to the matching character. Anything
    else is returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric entity; the hexadecimal form has to accept the digits a-f too
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return unichr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point not representable: fall back to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
92
# Backported bugfix: override the regular expression the parser module uses
# to locate the end of a start tag.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE)


class IDParser(HTMLParser.HTMLParser):
    """HTMLParser variant that isolates the tag carrying a given id.

    Feed a document with loads(), then call get_result() to obtain the
    markup found between the opening and the closing tag of that element.
    """

    def __init__(self, id):
        self.id = id
        # Becomes [tag, start position, end position] while parsing
        self.result = None
        # True while the parser is inside the requested element
        self.started = False
        # Number of currently open tags, per tag name, inside the element
        self.depth = {}
        self.html = None
        # True between the requested opening tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        HTMLParser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping a line, when possible.

        The error is raised as HTMLParseError once more than 10 errors were
        seen or when it happens inside the requested element.
        """
        too_many_errors = self.error_count > 10
        if too_many_errors or self.started:
            raise HTMLParser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining_lines = self.html.split('\n')[current_line:]
        self.rawdata = '\n'.join(remaining_lines)  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document passed in html."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attributes and attributes['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element ends once every tag of its own type has been closed
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())

    # Any parser event following the opening tag marks the start position
    handle_entityref = find_startpos
    handle_charref = find_startpos
    handle_data = find_startpos
    handle_comment = find_startpos
    handle_decl = find_startpos
    handle_pi = find_startpos
    unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped content of the element, or None when no
        complete (tag, start, end) result was recorded."""
        if self.result is None or len(self.result) != 3:
            return None
        _tag, (start_line, start_col), (end_line, end_col) = self.result
        # Line numbers are 1-based, column offsets are 0-based
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # The line was already cut at the start column: shift the end
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
158
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document.

    None is returned when the parser recorded no complete result.
    """
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return id_parser.get_result()
167
168
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Literal newlines become spaces, <br> tags become newlines, every other
    tag is removed and the HTML entities are unescaped.
    """
    # Only <br /> tags may produce line breaks in the output
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    # Strip the remaining html tags
    text = re.sub(r'<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text)
179
180
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    The file name u'-' stands for the standard output (switched to binary
    mode on win32). Any other name is opened with open_mode; if that fails
    with IOError or OSError, a second attempt is made with the characters
    forbidden on win32 replaced by '#'. A failure of that second attempt
    is raised to the caller, like the standard open() function would.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != u'-':
            return (open(encodeFilename(filename), open_mode), filename)
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(encodeFilename(filename), open_mode), filename)
206
207
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
215
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters:
    whitespace, '!', '&', the single quote and non-ASCII characters are
    replaced by underscores as well.
    """
    def replace_insane(char):
        # Question marks and control characters are dropped
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            if restricted:
                return ''
            return '\''
        if char == ':':
            if restricted:
                return '_-'
            return ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace() or ord(char) > 127):
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    # Collapse every run of underscores into a single one
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never return an empty name
    return result if result else '_'
245
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable.

    Returns a list holding the first occurrence of every element, in the
    original order. Elements are compared by equality.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
253
def unescapeHTML(s):
    """
    Replace every '&...;' sequence found in s with the result of
    htmlentity_transform().

    @param s a string (it must be a unicode string)
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
262
def encodeFilename(s):
    """
    Prepare a file name for use with the file system functions.

    @param s The name of the file (it must be a unicode string)
    @return  s itself on Windows 2000 and up; everywhere else s encoded to
             a byte string with the file system encoding, silently
             dropping the characters that cannot be encoded
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # sys.getfilesystemencoding() may return None on Python 2; encoding
    # with None would raise TypeError, so fall back to UTF-8
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
277
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
286
287
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
295
296
class PostProcessingError(Exception):
    """Post Processing exception.

    The .run() method of a PostProcessor may raise this exception to
    indicate an error in the postprocessing task.
    """
304
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
308
309
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
317
318
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """
    # Both sizes are expressed in bytes
    downloaded = expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
333
334
class Trouble(Exception):
    """Trouble helper exception.

    An exception meant to be handled with FileDownloader.trouble
    """
    pass
341
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    The same processing is registered for HTTPS requests and responses.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflated data.

        The data is first treated as a raw deflate stream; if that fails it
        is decompressed again with the default zlib settings.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response object carrying a status code.

        When the addinfourl class has no getcode attribute, the object is
        built without the code and the code is set on it afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to the request and return it.

        The internal "Youtubedl-No-Compression" header is removed, together
        with the Accept-Encoding header, when it is present.
        """
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return a response whose body is decompressed.

        Responses with a gzip or deflate Content-encoding are read entirely
        and wrapped in a new response object with the same headers, URL,
        code and message; other responses are returned as they are.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # An OpenerDirector looks handlers up by "<protocol>_request" and
    # "<protocol>_response" method names; without these aliases the handler
    # is never invoked for https URLs.
    https_request = http_request
    https_response = http_response