]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Remove superfluous encodings
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
d77c3dfd
FV
6import locale
7import os
8import re
9import sys
10import zlib
d77c3dfd 11import email.utils
921a1455 12import json
d77c3dfd 13
01ba00ca
PH
14try:
15 import urllib.request as compat_urllib_request
16except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
18
19try:
20 import urllib.error as compat_urllib_error
21except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
23
24try:
25 import urllib.parse as compat_urllib_parse
26except ImportError: # Python 2
da779b49 27 import urllib as compat_urllib_parse
01ba00ca
PH
28
29try:
30 import http.cookiejar as compat_cookiejar
31except ImportError: # Python 2
32 import cookielib as compat_cookiejar
33
3e669f36
PH
34try:
35 import html.entities as compat_html_entities
9f37a959 36except ImportError: # Python 2
3e669f36
PH
37 import htmlentitydefs as compat_html_entities
38
a8156c1d
PH
39try:
40 import html.parser as compat_html_parser
9f37a959 41except ImportError: # Python 2
a8156c1d
PH
42 import HTMLParser as compat_html_parser
43
348d0a7a 44try:
5bd9cc7a 45 import http.client as compat_http_client
9f37a959 46except ImportError: # Python 2
5bd9cc7a 47 import httplib as compat_http_client
348d0a7a 48
9f37a959 49try:
73dce4b2 50 from urllib.parse import parse_qs as compat_parse_qs
9f37a959
PH
51except ImportError: # Python 2
52 from urlparse import parse_qs as compat_parse_qs
348d0a7a 53
3e669f36
PH
# Text type and code-point-to-character function under version-neutral names.
try:
    compat_str, compat_chr = unicode, unichr  # Python 2
except NameError:  # Python 3: the builtins already operate on text
    compat_str, compat_chr = str, chr
63
3e669f36
PH
# Default HTTP request headers; YoutubeDLHandler.http_request() applies each
# of them to every outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Compressed bodies are decoded again in YoutubeDLHandler.http_response().
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
d77c3dfd
FV
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or reports a codec that cannot encode a sample string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; the reported name may be unknown to Python.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale errors, unknown codec, bad type), but
        # unlike a bare "except:" it does not swallow KeyboardInterrupt or
        # SystemExit.
        pref = 'UTF-8'

    return pref
d77c3dfd 84
8cd10ac4
PH
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s (must be exactly the text type)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are emitted as XML
        character references.
        """
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
d77c3dfd
FV
92
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are looked up in compat_html_entities.name2codepoint.
    Numeric references may be decimal ('#47') or hexadecimal ('#x2F' or
    '#X2F'). Anything else, including numeric references that do not map to
    a valid character, is returned in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The pattern must cover the whole entity
    # and accept the hex digits a-f; otherwise '#x2F' would be cut down to
    # '#x2' and decoded as the wrong character.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))\\Z', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        try:
            if hexstr is not None:
                return compat_chr(int(hexstr, 16))
            return compat_chr(int(decstr, 10))
        except (ValueError, OverflowError):
            # Code point outside the representable range: keep the text.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
117
a8156c1d
PH
# backport bugfix: replace the parser module's start-tag locating pattern
compat_html_parser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE)
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""

    def __init__(self, id):
        # State is set up before the base initialiser runs.
        self.id = id
        self.html = None
        # Becomes [tag, start position, end position] once fully located.
        self.result = None
        self.started = False
        self.watch_startpos = False
        # Per-tag count of currently open elements, tracked while started.
        self.depth = {}
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error, unless inside the target or over budget."""
        give_up = self.error_count > 10 or self.started
        if give_up:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining = self.html.split('\n')[current_line:]
        self.rawdata = '\n'.join(remaining)  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Remember the document, then parse it completely."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once every tag of its own type is closed.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())

    # Every other parser event only serves to record the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        start, end = self.result[1], self.result[2]
        lines = self.html.split('\n')[start[0] - 1:end[0]]
        lines[0] = lines[0][start[1]:]
        if len(lines) == 1:
            # The first line was already shortened by the start column.
            lines[-1] = lines[-1][:end[1] - start[1]]
        lines[-1] = lines[-1][:end[1]]
        return '\n'.join(lines).strip()
183
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever the parser recorded before the error.
        pass
    return id_parser.get_result()
192
193
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines become spaces, <br> tags (with surrounding whitespace) become
    newlines, all remaining tags are removed and HTML entities are replaced
    through unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings keep the regex escapes from being read as (invalid)
    # string escape sequences.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
204
205
d77c3dfd
FV
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' stands for standard output. When a binary open_mode
    is requested, the underlying byte stream of sys.stdout is returned if
    there is one, so that callers can write bytes to it.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            if 'b' in open_mode:
                # A text-mode sys.stdout rejects bytes; prefer its buffer.
                return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
231
232
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94
PH
240
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def replace_insane(char):
        # Question marks and control characters are dropped entirely.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace() or ord(char) > 127):
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    # Collapse every run of underscores into a single one.
    result = re.sub(u'_{2,}', u'_', result)
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
270
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element, in
    the original order. Elements are compared by equality.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
278
def unescapeHTML(s):
    """
    @param s a string

    Every '&...;' sequence in s is passed to htmlentity_transform() and
    replaced by what it returns.
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
287
def encodeFilename(s):
    """
    @param s The name of the file

    Returns s unchanged on Windows 2000 and later; elsewhere returns s
    encoded with the filesystem encoding, dropping characters that the
    encoding cannot represent.
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        # The interpreter may not know the filesystem encoding; encoding
        # with None would raise TypeError, so use a sensible default.
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
302
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
311
312
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
320
321
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal
    an error in the postprocessing task.
    """
329
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
333
334
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
342
343
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    @param downloaded Number of bytes actually received
    @param expected   Number of bytes the server announced
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Hand the values to the base class as well, so that args and str()
        # are populated and copy/pickle can rebuild the exception.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected
358
359
0b8c922d
FV
class Trouble(Exception):
    """Trouble helper exception

    An exception meant to be handled with FileDownloader.trouble
    """
366
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Once installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request and decodes gzipped and deflated
    responses from web servers. To avoid compression for a particular
    request, the calling code only has to include the HTTP header
    "Youtubedl-No-Compression" in that request; the header is removed
    before the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, else as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response object that carries a status code."""
        factory = compat_urllib_request.addinfourl
        if hasattr(factory, 'getcode'):
            return factory(stream, headers, url, code)
        # This addinfourl takes no status code: attach it afterwards.
        response = factory(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Apply the standard headers and honour the no-compression marker."""
        for name in std_headers:
            req.headers.pop(name, None)
            req.add_header(name, std_headers[name])
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp, re-wrapped around the decoded body if compressed."""
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(
                body, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                body, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp