]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Restrict more characters (Closes #566)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import gzip
5 import io
6 import locale
7 import os
8 import re
9 import sys
10 import zlib
11 import email.utils
12 import json
13
14 try:
15 import urllib.request as compat_urllib_request
16 except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
18
19 try:
20 import urllib.error as compat_urllib_error
21 except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
23
24 try:
25 import urllib.parse as compat_urllib_parse
26 except ImportError: # Python 2
27 import urllib as compat_urllib_parse
28
29 try:
30 from urllib.parse import urlparse as compat_urllib_parse_urlparse
31 except ImportError: # Python 2
32 from urlparse import urlparse as compat_urllib_parse_urlparse
33
34 try:
35 import http.cookiejar as compat_cookiejar
36 except ImportError: # Python 2
37 import cookielib as compat_cookiejar
38
39 try:
40 import html.entities as compat_html_entities
41 except ImportError: # Python 2
42 import htmlentitydefs as compat_html_entities
43
44 try:
45 import html.parser as compat_html_parser
46 except ImportError: # Python 2
47 import HTMLParser as compat_html_parser
48
49 try:
50 import http.client as compat_http_client
51 except ImportError: # Python 2
52 import httplib as compat_http_client
53
# compat_parse_qs: use the standard library's urllib.parse.parse_qs when it
# can be imported; otherwise define a local replacement below.
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in `string` by the characters they encode.

        Consecutive escapes are accumulated as raw bytes and decoded together
        with `encoding` / `errors` (None selects 'utf-8' / 'replace').
        A '%' that is not followed by a decodable pair is kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to unquote.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): this relies on the Python 2 'hex' str codec.
                # It looks like that codec may raise TypeError (not
                # ValueError) for non-hex digits, in which case the handler
                # below would not catch it -- confirm on a Python 2 runtime.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: restore the '%' removed by split().
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. '+' is turned into a space and
        %xx escapes are decoded with _unquote(); names and values are passed
        through `unicode`. Fields without '=' raise ValueError when
        `strict_parsing` is true, are kept with an empty value when
        `keep_blank_values` is true, and are dropped otherwise.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            # Empty values are only recorded when keep_blank_values is set.
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of a repeated name are appended in the order they appear.
        Arguments are forwarded to _parse_qsl().
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
132
# compat_str: the text type -- `unicode` where that builtin exists,
# otherwise `str`.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: code point -> one-character text string -- `unichr` where that
# builtin exists, otherwise `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
142
# Default HTTP request headers. YoutubeDLHandler.http_request() adds every
# entry of this dict to each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Compressed responses are decoded again in YoutubeDLHandler.http_response().
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    The encoding reported by the locale is only used if it can actually
    encode text; when looking it up or using it fails, 'UTF-8' is returned
    instead. KeyboardInterrupt and SystemExit are not swallowed.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable encoding name raises here.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
163
# compat_print(s): print a text string on either major Python version.
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s; anything but a text string is rejected."""
        assert type(u'') == type(s)
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
171
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are looked up in the name2codepoint table; numeric
    references may be decimal ('#47') or hexadecimal ('#x2F' / '#X2f').
    Anything else -- including numeric references with trailing garbage or
    a code point that cannot be represented -- is returned unchanged in its
    literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # The whole entity must be a numeric reference; hexadecimal references
    # need the full set of hex digits, not just 0-9.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)\\Z', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith((u'x', u'X')):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: fall through and keep the
            # literal text instead of aborting the whole substitution.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
196
# Replace the HTML parser module's module-level `locatestarttagend` pattern
# with the one below.
# NOTE(review): the original comment only says "backport bugfix"; presumably
# this copies a corrected start-tag pattern from a newer CPython -- confirm
# which upstream fix it corresponds to.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id

    Feed a document with loads(); get_result() then returns the text found
    between the start tag carrying the requested id and its end tag, or
    None when no complete element was captured.
    """
    def __init__(self, id):
        # id attribute value to look for
        self.id = id
        # None until a match; then [tag, start position, end position]
        self.result = None
        # True while parsing inside the matched element
        self.started = False
        # per tag name: number of currently open occurrences since the match
        self.depth = {}
        # the complete document passed to loads()
        self.html = None
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        # number of parse errors skipped so far
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead.

        Raises HTMLParseError once more than 10 errors were skipped or when
        the error occurs inside the element being captured.
        """
        # NOTE(review): html.parser.HTMLParseError no longer exists in recent
        # Python 3 releases (removed in 3.5 according to the html.parser
        # docs); there this raise would fail with AttributeError -- confirm
        # the supported interpreter versions.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Restart on the document lines following the current line.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document `html`."""
        # Keep the text: error() and get_result() slice it by line.
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also an event that can mark the start
            # position of the captured content.
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            # Found the requested element: begin capturing.
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            # Count open tags per name so the matching end tag can be found.
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # All occurrences of the matched tag name are closed again:
            # the element ends here.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos, so the first event
    # after the matching start tag records the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text of the captured element, or None.

        None is returned unless a tag name, a start position and an end
        position were all recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, column) pairs from getpos(); the line number
        # is used 1-based here.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        # Drop everything before the start column on the first line.
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column has to be shifted by
            # the start column that was just cut off.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        # Cut the last line at the end column (normally without further
        # effect in the single-line case handled above).
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
262
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document

    Parse errors raised while reading the document are ignored; whatever
    the parser captured up to that point decides the result, which is None
    when no complete element was found.
    """
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and use what has been captured so far.
        pass
    content = id_parser.get_result()
    return content
271
272
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines of the markup are turned into spaces, <br> tags (with any
    surrounding whitespace) become newlines, all remaining tags are removed
    and HTML entities are replaced by the characters they stand for.
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw string: '\s' is not a valid escape in a normal string literal.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
283
284
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a tweaked name if that fails.

    The name u'-' selects standard output (switched to binary mode on
    win32). Any other name is opened with open_mode; if that raises
    IOError or OSError, the characters / < > : " | \\ ? * are replaced by
    '#' and the open is attempted again. A failure of that second attempt
    propagates to the caller, like the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != u'-':
            return (open(encodeFilename(filename), open_mode), filename)
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(encodeFilename(filename), open_mode), filename)
310
311
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed as a date.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
319
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.

    Characters are mapped one by one, runs of underscores are collapsed to
    one, underscores at both ends are removed, and an empty outcome is
    replaced by a single underscore.
    """
    always_underscored = '\\/|*<>'
    underscored_if_restricted = '!&\'()[]{}$;`^,#'

    def substitute(ch):
        # '?' and control characters (including DEL) are dropped entirely.
        if ch == '?' or ord(ch) < 32 or ord(ch) == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in always_underscored:
            return '_'
        if restricted and (ch in underscored_if_restricted
                           or ch.isspace() or ord(ch) > 127):
            return '_'
        return ch

    mapped = u''.join(substitute(ch) for ch in s)
    # Dropping the empty pieces between underscores both collapses runs of
    # underscores and trims them from the two ends.
    result = '_'.join(piece for piece in mapped.split('_') if piece)
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
349
def orderedSet(iterable):
    """ Return the elements of iterable as a list without duplicates.

    The first occurrence of every element is kept, in input order.
    Elements are compared with ==, so they do not need to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
357
def unescapeHTML(s):
    """
    Replace the HTML entities in s using htmlentity_transform.

    @param s a string (must be a text string, not bytes)
    """
    assert type(u'') == type(s)

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
366
def encodeFilename(s):
    """
    Return the file name in the form expected by the file system functions.

    @param s The name of the file (must be a text string)
    """

    assert type(u'') == type(s)

    # Python 3 has a Unicode API. On Python 2, pass u'' directly to use the
    # Unicode APIs on Windows 2000 and up. (Detecting Windows NT 4 is tricky
    # because 'major >= 4' would match Windows 9x series as well. Besides,
    # NT 4 is obsolete.)
    takes_unicode = sys.version_info >= (3, 0) or (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if takes_unicode:
        return s

    # Everywhere else encode for the file system, dropping characters the
    # encoding cannot represent.
    return s.encode(sys.getfilesystemencoding(), 'ignore')
385
class DownloadError(Exception):
    """Raised when a download fails.

    FileDownloader objects may throw this exception if they are not
    configured to continue on errors. The instance carries the
    appropriate error message.
    """
394
395
class SameFileError(Exception):
    """Raised when several downloads would end up in one file.

    FileDownloader objects throw this exception if they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
403
404
class PostProcessingError(Exception):
    """Raised when a postprocessing task fails.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
412
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
416
417
class UnavailableVideoError(Exception):
    """Raised for an unavailable format.

    This exception is thrown when a video is requested in a format that
    is not available for that video.
    """
425
426
class ContentTooShortError(Exception):
    """Raised when a download delivers fewer bytes than announced.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults for the two byte counts; every instance
    # overwrites them in __init__.
    downloaded = None  # bytes actually received
    expected = None  # bytes announced by the server

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
441
442
class Trouble(Exception):
    """Helper exception for reporting trouble.

    It is meant to be handled with FileDownloader.trouble
    """
449
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """HTTP handler that adds the standard headers and decodes compression.

    When installed with an OpenerDirector, this class automatically adds
    the standard headers to every HTTP request and transparently handles
    gzipped and deflated responses from web servers. To avoid compression
    for a particular request, the original request in the program code only
    has to include the HTTP header "Youtubedl-No-Compression", which is
    removed again before the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without a zlib header."""
        try:
            # Raw deflate stream (no header) first ...
            inflated = zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # ... then the zlib-wrapped form.
            inflated = zlib.decompress(data)
        return inflated

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries a status code.

        If addinfourl has no getcode attribute, the code is not passed to
        the constructor but set as an attribute afterwards.
        """
        addinfourl = compat_urllib_request.addinfourl
        if not hasattr(addinfourl, 'getcode'):
            response = addinfourl(stream, headers, url)
            response.code = code
            return response
        return addinfourl(stream, headers, url, code)

    def http_request(self, req):
        """Add the standard headers to req and honour the no-compression marker."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header is internal: drop it, together with the
            # Accept-encoding header it is meant to suppress.
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp, wrapped so that reading it yields decompressed data."""
        original = resp
        content_encoding = resp.headers.get('Content-encoding', '')
        if content_encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
        elif content_encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
        else:
            # Not compressed: hand the response back untouched.
            return resp
        decoded = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
        decoded.msg = original.msg
        return decoded