]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Do not use deprecated method
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
f4bfd65f 6import json
d77c3dfd
FV
7import locale
8import os
9import re
10import sys
11import zlib
d77c3dfd 12import email.utils
921a1455 13import json
d77c3dfd 14
01ba00ca 15try:
59ae15a5 16 import urllib.request as compat_urllib_request
01ba00ca 17except ImportError: # Python 2
59ae15a5 18 import urllib2 as compat_urllib_request
01ba00ca
PH
19
20try:
59ae15a5 21 import urllib.error as compat_urllib_error
01ba00ca 22except ImportError: # Python 2
59ae15a5 23 import urllib2 as compat_urllib_error
01ba00ca
PH
24
25try:
59ae15a5 26 import urllib.parse as compat_urllib_parse
01ba00ca 27except ImportError: # Python 2
59ae15a5 28 import urllib as compat_urllib_parse
01ba00ca 29
799c0763
PH
30try:
31 from urllib.parse import urlparse as compat_urllib_parse_urlparse
32except ImportError: # Python 2
33 from urlparse import urlparse as compat_urllib_parse_urlparse
34
01ba00ca 35try:
59ae15a5 36 import http.cookiejar as compat_cookiejar
01ba00ca 37except ImportError: # Python 2
59ae15a5 38 import cookielib as compat_cookiejar
01ba00ca 39
3e669f36 40try:
59ae15a5 41 import html.entities as compat_html_entities
9f37a959 42except ImportError: # Python 2
59ae15a5 43 import htmlentitydefs as compat_html_entities
3e669f36 44
a8156c1d 45try:
59ae15a5 46 import html.parser as compat_html_parser
9f37a959 47except ImportError: # Python 2
59ae15a5 48 import HTMLParser as compat_html_parser
a8156c1d 49
348d0a7a 50try:
59ae15a5 51 import http.client as compat_http_client
9f37a959 52except ImportError: # Python 2
59ae15a5 53 import httplib as compat_http_client
348d0a7a 54
5910e210
PH
55try:
56 from subprocess import DEVNULL
57 compat_subprocess_get_DEVNULL = lambda: DEVNULL
58except ImportError:
59 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
60
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding/errors, so multi-byte sequences
        survive.  Malformed escapes are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (TypeError, ValueError):
                # Python 2's hex codec reports non-hex or odd-length input
                # with TypeError, so both must be treated as "not an escape".
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'.  Fields without '=' raise
        ValueError when strict_parsing is set; otherwise they are skipped
        unless keep_blank_values is true.
        """
        _coerce_result = unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 139
# Text type and code-point-to-character function for the running interpreter:
# Python 2 provides them as unicode/unichr, Python 3 as str/chr.
compat_str = unicode if sys.version_info < (3, 0) else str

compat_chr = unichr if sys.version_info < (3, 0) else chr
3e669f36 149
# Default HTTP headers.  YoutubeDLHandler.http_request() sets each of these on
# every outgoing request, replacing any value the request already carried.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or names an encoding that cannot encode text, 'UTF-8'
    is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported encoding actually exists and is usable.
        u'TEST'.encode(pref)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 170
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print the text string s; s must be a text (unicode) string."""
        # isinstance rather than an exact type match, so str subclasses work.
        assert isinstance(s, type(u''))
        print(s)
d77c3dfd 178
f4bfd65f
PH
def write_json_file(obj, fn):
    """Serialize obj as JSON into the file named fn.

    In Python 2.x, json.dump expects a bytestream, so the file is opened in
    binary mode.  In Python 3.x, it writes to a character stream, so the file
    is opened as UTF-8 text.
    """
    if sys.version_info < (3, 0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    with stream:
        json.dump(obj, stream)
189
190
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.  Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.  Entities that are unknown,
    malformed or outside the valid code point range are returned in their
    literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference: decimal (#65) or hexadecimal (#x41 / #X41).  The
    # whole entity must match so trailing garbage is not silently dropped.
    mobj = re.match(u'#([xX][0-9a-fA-F]+|[0-9]+)\\Z', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        try:
            if numstr[0] in u'xX':
                codepoint = int(numstr[1:], 16)
            else:
                codepoint = int(numstr, 10)
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Code point out of range: fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 215
a8156c1d
PH
# Replace the HTML parser module's regex for locating the end of a start tag
# with a backported, fixed version.
# NOTE(review): presumably this only has an effect on interpreter versions
# whose parser module still reads this attribute -- confirm.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""

    def __init__(self, id):
        self.id = id
        # Becomes [tag, start_pos, end_pos] once the element has been seen;
        # positions are the (line, column) pairs reported by getpos().
        self.result = None
        self.started = False
        # Open-tag counter per tag name, tracked while inside the element
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Skip the offending line and resume, unless the target tag was
        already entered or more than ten errors have been skipped."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining = self.html.split('\n')[current_line:]  # skip one line
        self.rawdata = '\n'.join(remaining)
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document html in one go."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attributes and attributes['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element ends once every tag of its kind has been closed again
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if the element was not fully captured."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1], self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line: the end column has to be shifted
            # by what was just cut off the front.
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
9e6dd238
FV
281
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # A parse failure is ignored; the parser's result is returned as-is.
        pass
    return id_parser.get_result()
9e6dd238
FV
290
291
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />: source newlines are not significant in HTML, so
    # they become spaces, and only <br> tags produce line breaks.
    html = html.replace('\n', ' ')
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
9e6dd238
FV
302
303
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename u'-' selects standard output; when open_mode is a
    binary mode, the underlying binary buffer of sys.stdout is returned if
    there is one, so that bytes can be written to it.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = sys.stdout
            if 'b' in open_mode:
                # Python 3's sys.stdout only accepts text; its .buffer
                # attribute is the binary stream underneath.
                out = getattr(sys.stdout, 'buffer', sys.stdout)
            return (out, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
d77c3dfd
FV
329
330
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 338
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _clean_char(char):
        # Map one character to its replacement (possibly empty)
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join(_clean_char(char) for char in s)
    if not is_id:
        # Collapse runs of underscores and trim them from both ends
        result = re.sub(u'_{2,}', u'_', result).strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
d77c3dfd
FV
370
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element, in
    the original order.  Elements are compared by equality, so they need
    not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
378
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for.

    @param s a string
    """
    # isinstance rather than an exact type match, so str subclasses work.
    assert isinstance(s, type(u''))

    # Everything between '&' and the next ';' is handed to
    # htmlentity_transform, which decides what to substitute.
    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
d77c3dfd
FV
387
def encodeFilename(s):
    """Return the file name s in the form the file APIs expect.

    @param s The name of the file
    """

    # isinstance rather than an exact type match, so str subclasses work.
    assert isinstance(s, type(u''))

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        # Characters the filesystem encoding cannot represent are dropped
        return s.encode(sys.getfilesystemencoding(), 'ignore')
d77c3dfd
FV
406
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
d77c3dfd
FV
415
416
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    several downloads would end up in the same file on disk.
    """
d77c3dfd
FV
424
425
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal
    that the postprocessing task failed.
    """
d77c3dfd
FV
433
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
437
438
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
446
447
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Hand the values to Exception as well, so that args and repr()
        # carry them instead of being empty.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected

    def __str__(self):
        return 'Content too short: downloaded %s bytes, expected %s bytes' % (
            self.downloaded, self.expected)
d77c3dfd
FV
462
463
class Trouble(Exception):
    """Trouble helper exception

    An exception meant to be handled with FileDownloader.trouble
    """
0b8c922d 470
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to a
        zlib-wrapped stream if that fails."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also when
        addinfourl itself has no getcode support."""
        addinfourl = compat_urllib_request.addinfourl
        if not hasattr(addinfourl, 'getcode'):
            wrapped = addinfourl(stream, headers, url)
            wrapped.code = code
            return wrapped
        return addinfourl(stream, headers, url, code)

    def http_request(self, req):
        # Force the standard headers onto the request
        for name in std_headers:
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, std_headers[name])
        # The marker header disables compression and is itself stripped
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        original = resp
        encoding = resp.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
        elif encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
        else:
            # Not compressed: pass the response through untouched
            return resp
        resp = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
        resp.msg = original.msg
        return resp

    https_request = http_request
    https_response = http_response