]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Fall back to urllib instead of urllib2 for Python 3 urllib.parse
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
5import htmlentitydefs
6import HTMLParser
7import locale
8import os
9import re
10import sys
11import zlib
d77c3dfd 12import email.utils
921a1455 13import json
d77c3dfd
FV
14
15try:
16 import cStringIO as StringIO
17except ImportError:
18 import StringIO
19
# Default HTTP headers, keyed by header name. YoutubeDLHandler.http_request
# sets every entry of this dict on each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
27
# compat_str is the text (unicode) string type of the running interpreter.
try:
    compat_str = unicode # Python 2
except NameError:
    # No `unicode` builtin is defined here, so `str` is used instead.
    compat_str = str
92b91c18 32
01ba00ca
PH
33try:
34 import urllib.request as compat_urllib_request
35except ImportError: # Python 2
36 import urllib2 as compat_urllib_request
37
38try:
39 import urllib.error as compat_urllib_error
40except ImportError: # Python 2
41 import urllib2 as compat_urllib_error
42
43try:
44 import urllib.parse as compat_urllib_parse
45except ImportError: # Python 2
da779b49 46 import urllib as compat_urllib_parse
01ba00ca
PH
47
48try:
49 import http.cookiejar as compat_cookiejar
50except ImportError: # Python 2
51 import cookielib as compat_cookiejar
52
d77c3dfd
FV
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or the reported encoding cannot encode a short test
    string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Check that the reported codec exists and is actually usable.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad: any locale or codec failure falls back to
        # UTF-8. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd
FV
66
67
# Self-contained Python 2/3 shims for the names htmlentity_transform needs.
try:
    from html.entities import name2codepoint as _name2codepoint
except ImportError: # Python 2
    from htmlentitydefs import name2codepoint as _name2codepoint

try:
    _compat_chr = unichr # Python 2
except NameError:
    _compat_chr = chr


def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity body,
    i.e. the text between '&' and ';'.

    Named entities ('amp'), decimal references ('#47') and hexadecimal
    references ('#x2F', '#X2f') are decoded. Anything else, including
    numeric references whose code point cannot be represented, is returned
    unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in _name2codepoint:
        return _compat_chr(_name2codepoint[entity])

    # Hex references need the full hex digit set, and the pattern is
    # anchored so that trailing garbage ('#47abc') is not silently dropped.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return _compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the representable range: keep the literal.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
92
# Overwrite the HTMLParser module's `locatestarttagend` attribute with the
# compiled start-tag pattern below.
# NOTE(review): presumably this mirrors a fix from a newer standard library
# release - confirm which one before changing the pattern.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
9e6dd238
FV
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id

    Usage: build the parser with the wanted id, call loads() with the
    document and then get_result() to obtain the text found between the
    matching opening tag and its closing tag (or None).
    """
    def __init__(self, id):
        self.id = id
        # Built up step by step as [tag, start position, end position];
        # positions are whatever self.getpos() returned at that moment.
        self.result = None
        # True while parsing inside the element with the requested id
        self.started = False
        # Open-tag counter per tag name, maintained only while started
        self.depth = {}
        # The complete document handed to loads()
        self.html = None
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        # Number of parse errors recovered from so far (see error())
        self.error_count = 0
        HTMLParser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error, or raise HTMLParseError.

        NOTE(review): presumably called by the HTMLParser base class when
        it meets markup it cannot parse - confirm for the targeted Python
        versions.
        """
        # Give up after more than 10 recoveries, or when the error occurs
        # inside the element being extracted.
        if self.error_count > 10 or self.started:
            raise HTMLParser.HTMLParseError(message, self.getpos())
        # Restart parsing on the remainder of the document
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Store the document and run it through the parser."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Start recording on the tag with the requested id; count nesting."""
        attrs = dict(attrs)
        if self.started:
            # A nested tag is also an event that can fix the start position
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Stop recording once the matched tag's own counter is back to 0."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # self.result[0] is the name of the tag that carried the id
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other handler only serves to record the start position; the
    # argument they receive is ignored.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless tag, start and end were all recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Index 0 of a position is used as a 1-based line number, index 1
        # as an offset into that line.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line whose head was just cut off, so
            # the end offset has to be shifted by the start offset.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
158
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document

    The result is whatever IDParser.get_result() reports after the parse;
    a parse error does not propagate to the caller.
    """
    extractor = IDParser(id)
    try:
        extractor.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: keep whatever was recorded before the error.
        pass
    return extractor.get_result()
167
168
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines in the input become spaces, <br> tags (with the whitespace
    around them) become newlines, all remaining tags are removed and HTML
    entities are decoded through unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
179
180
d77c3dfd
FV
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename u'-' selects standard output instead of a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Where sys.stdout is a text wrapper (Python 3), hand out its
            # underlying binary buffer; otherwise sys.stdout itself.
            # NOTE(review): assumes callers write binary data to this
            # stream, as the O_BINARY switch above suggests - confirm.
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
206
207
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when email.utils.parsedate_tz() cannot parse the string.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94
PH
215
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.

    Runs of underscores are collapsed and underscores at either end are
    removed; an empty result is replaced by '_'.
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            if restricted:
                return ''
            return '\''
        if char == ':':
            if restricted:
                return '_-'
            return ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    # Dropping the empty pieces between underscores both collapses runs of
    # '_' and trims them from the two ends.
    result = u'_'.join(piece for piece in result.split('_') if piece)
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
d77c3dfd
FV
245
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element, in
    the original order. Elements are compared by equality.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
253
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s using htmlentity_transform().

    @param s a string (must be a unicode text string)
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
262
def encodeFilename(s):
    """
    Return the form of a file name to hand to the file system APIs.

    @param s The name of the file (must be a unicode text string)
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        # Python 2 may report None here; encode() needs a real codec name.
        encoding = 'utf-8'
    # Characters the file system encoding cannot represent are dropped.
    return s.encode(encoding, 'ignore')
277
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors; it then carries the appropriate
    error message.
    """
286
287
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
295
296
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
304
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
308
309
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
317
318
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: amount of data actually received, in bytes
        expected: amount of data the server announced, in bytes
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Initialise the base class so that str(exc) and tracebacks show
        # both sizes instead of an empty message.
        Exception.__init__(
            self,
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
333
334
0b8c922d
FV
class Trouble(Exception):
    """Trouble helper exception

    Exceptions of this type are meant to be handled with
    FileDownloader.trouble
    """
341
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, given either as a raw stream or with a
        zlib header."""
        try:
            # A negative window size makes zlib expect a headerless stream
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response object that carries the status code."""
        # Pass the code to the constructor when addinfourl has getcode();
        # otherwise set it as an attribute afterwards.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Set the standard headers on req and honour the no-compression
        marker header. Returns the modified request."""
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # NOTE(review): the lookups below use the capitalised key spelling;
        # presumably that is how add_header() stores names - confirm.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp, re-wrapped around the decompressed body when its
        Content-encoding header is 'gzip' or 'deflate'."""
        # Local import keeps this block self-contained. The response body
        # is binary data, which io.BytesIO holds on Python 2 and 3 alike.
        import io

        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp