]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Convert all tabs to 4 spaces (PEP8)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
d77c3dfd
FV
6import locale
7import os
8import re
9import sys
10import zlib
d77c3dfd 11import email.utils
921a1455 12import json
d77c3dfd 13
01ba00ca 14try:
59ae15a5 15 import urllib.request as compat_urllib_request
01ba00ca 16except ImportError: # Python 2
59ae15a5 17 import urllib2 as compat_urllib_request
01ba00ca
PH
18
19try:
59ae15a5 20 import urllib.error as compat_urllib_error
01ba00ca 21except ImportError: # Python 2
59ae15a5 22 import urllib2 as compat_urllib_error
01ba00ca
PH
23
24try:
59ae15a5 25 import urllib.parse as compat_urllib_parse
01ba00ca 26except ImportError: # Python 2
59ae15a5 27 import urllib as compat_urllib_parse
01ba00ca
PH
28
29try:
59ae15a5 30 import http.cookiejar as compat_cookiejar
01ba00ca 31except ImportError: # Python 2
59ae15a5 32 import cookielib as compat_cookiejar
01ba00ca 33
3e669f36 34try:
59ae15a5 35 import html.entities as compat_html_entities
9f37a959 36except ImportError: # Python 2
59ae15a5 37 import htmlentitydefs as compat_html_entities
3e669f36 38
a8156c1d 39try:
59ae15a5 40 import html.parser as compat_html_parser
9f37a959 41except ImportError: # Python 2
59ae15a5 42 import HTMLParser as compat_html_parser
a8156c1d 43
348d0a7a 44try:
59ae15a5 45 import http.client as compat_http_client
9f37a959 46except ImportError: # Python 2
59ae15a5 47 import httplib as compat_http_client
348d0a7a 48
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* by the characters they encode.

        Consecutive escapes are accumulated as bytes and decoded together
        with *encoding* / *errors* (defaulting to 'utf-8' / 'replace' when
        None), so multi-byte sequences are decoded as a unit. Escapes that
        are empty or not valid hex are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                # Python 2's hex codec signals bad or odd-length digits with
                # TypeError; keep the malformed escape as literal text.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse query string *qs* into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Blank values are dropped unless
        *keep_blank_values* is true. With *strict_parsing*, a field without
        '=' raises ValueError; otherwise it is skipped (or kept with an empty
        value when *keep_blank_values* is true).
        """
        _coerce_result = unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse query string *qs* into a dict mapping each name to a list of values.

        Values of a repeated name are collected in order of appearance.
        Arguments are passed through to _parse_qsl().
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 127
# Text string type: ``unicode`` where that builtin exists (Python 2), else ``str``.
try:
    unicode
except NameError:
    compat_str = str
else:
    compat_str = unicode  # Python 2

# Code point -> one-character text string: ``unichr`` on Python 2, else ``chr``.
try:
    unichr
except NameError:
    compat_chr = chr
else:
    compat_chr = unichr  # Python 2
3e669f36 137
# Default HTTP headers; YoutubeDLHandler.http_request() sets each of them on
# every outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any lookup/encode failure means "fall back"),
        # but no longer swallows KeyboardInterrupt or SystemExit.
        pref = 'UTF-8'

    return pref
d77c3dfd 158
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print text string *s*, encoded with the preferred encoding (Python 2)."""
        # Characters the encoding cannot represent are emitted as XML
        # character references instead of raising.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print text string *s* (Python 3).

        *s* must be a text string; subclasses of the text type are accepted.
        """
        assert isinstance(s, type(u''))
        print(s)
d77c3dfd
FV
166
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are looked up in compat_html_entities.name2codepoint;
    numeric references may be decimal (``#47``) or hexadecimal (``#x2F``).
    Anything else, including numeric references outside the valid code point
    range, is returned in its literal ``&...;`` form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # The whole entity must be a numeric reference; hex digits include a-f.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 191
a8156c1d
PH
# Backport bugfix: replace the HTML parser module's start-tag locator regex.
compat_html_parser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE)
class IDParser(compat_html_parser.HTMLParser):
    """HTMLParser variant that isolates the element carrying a given id.

    Once the element has been fully parsed, ``result`` is
    ``[tag, start_pos, end_pos]``: the tag name, the getpos() value recorded
    by the first parser event after the opening tag, and the getpos() value
    recorded when the matching closing tag is handled.
    """

    def __init__(self, id):
        self.id = id
        self.result = None
        self.started = False
        # Per-tag count of elements currently open inside the target element
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error: give up, or skip one line and continue.

        Raises HTMLParseError once more than 10 errors have been skipped or
        when the target element is already being read.
        """
        if self.started or self.error_count > 10:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        # Resume with the document text following the current line (skip one line)
        self.rawdata = '\n'.join(self.html.split('\n')[current_line:])
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document *html*."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attributes and attributes['id'] == self.id:
            # Found the target element: start tracking it
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The target element is complete once its own tag count is back to zero
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event merely serves to record the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the target's tags, or None if incomplete."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        # getpos() line numbers are 1-based
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # The end column still counts from the untrimmed line start
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
9e6dd238
FV
257
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document.

    Returns None when IDParser.get_result() has no complete match to report.
    """
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: report whatever the parser captured before the error
        pass
    return id_parser.get_result()
9e6dd238
FV
266
267
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Newlines become spaces, <br> tags (with surrounding whitespace) become
    newlines, all other tags are removed and HTML entities are unescaped.
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
9e6dd238
FV
278
279
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename u'-' selects standard output; when *open_mode* is
    binary and sys.stdout exposes a ``buffer`` attribute, that byte stream
    is returned so that bytes can be written to it.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = sys.stdout
            if 'b' in open_mode:
                # Python 3's sys.stdout is text-only; bytes go to its buffer
                out = getattr(sys.stdout, 'buffer', sys.stdout)
            return (out, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
d77c3dfd
FV
305
306
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when *timestr* cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94
PH
314
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def _clean(ch):
        # Map a single character to its filename-safe replacement
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'' or ch.isspace() or code > 127):
            return '_'
        return ch

    cleaned = u''.join(_clean(ch) for ch in s)
    # Collapse runs of underscores, then trim them from both ends
    cleaned = re.sub('_{2,}', '_', cleaned).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    return cleaned if cleaned else '_'
d77c3dfd
FV
344
def orderedSet(iterable):
    """Return a list of the items of *iterable* with duplicates removed.

    The first occurrence of each item is kept, in its original position.
    Items are compared by equality, so unhashable items work too.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
352
def unescapeHTML(s):
    """Replace the HTML entities in *s* by the characters they stand for.

    @param s a text string (subclasses of the text type are accepted)

    Every ``&...;`` sequence is passed to htmlentity_transform().
    """
    assert isinstance(s, type(u''))

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
d77c3dfd
FV
361
def encodeFilename(s):
    """Return *s* in the form expected by the platform's file APIs.

    @param s The name of the file (a text string)

    On Python 3, and on Python 2 under Windows 2000 and later, the text is
    returned unchanged. Otherwise it is encoded with the filesystem
    encoding, dropping characters that cannot be represented.
    """

    assert isinstance(s, type(u''))

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        # Python 2 may report None here; encode(None, ...) would raise
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
d77c3dfd
FV
380
class DownloadError(Exception):
    """Raised when a download fails.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors; it carries the appropriate error
    message.
    """
d77c3dfd
FV
389
390
class SameFileError(Exception):
    """Raised when several downloads would end up in the same file.

    FileDownloader objects throw this exception if they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
398
399
class PostProcessingError(Exception):
    """Raised when a postprocessing task fails.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
d77c3dfd
FV
407
class MaxDownloadsReached(Exception):
    """Raised when the --max-downloads limit has been reached."""
d77c3dfd
FV
411
412
class UnavailableVideoError(Exception):
    """Raised when a requested format is unavailable.

    This exception is thrown when a video is requested in a format that
    is not available for that video.
    """
d77c3dfd
FV
420
421
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    The ``downloaded`` and ``expected`` attributes hold the two sizes, and
    the exception message states both.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Initialise the base class so str(exc) and exc.args are meaningful
        Exception.__init__(
            self,
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd
FV
436
437
class Trouble(Exception):
    """Helper exception for reporting trouble.

    This exception is meant to be handled with FileDownloader.trouble
    """
0b8c922d 444
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class sets the standard
    headers on every HTTP request and transparently decodes gzipped and
    deflated responses. To avoid compression for a particular request, the
    calling code only has to include the HTTP header
    "Youtubedl-No-Compression", which is removed before the real request
    is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data* as a raw deflate stream, else as a zlib stream."""
        try:
            inflated = zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            inflated = zlib.decompress(data)
        return inflated

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response whose ``code`` attribute is *code*."""
        make_response = compat_urllib_request.addinfourl
        if hasattr(make_response, 'getcode'):
            return make_response(stream, headers, url, code)
        # No getcode(): build without the code argument and attach it afterwards
        wrapped = make_response(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        """Set the standard headers on *req* and honour the no-compression flag."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            # The marker header itself is never sent
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return *resp*, re-wrapped with a decoded body if it was compressed."""
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(
                body, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                body, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp