]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
modified FFmpegExtractAudioPP to accept whether it should overwrite post-processed...
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
f4bfd65f 6import json
d77c3dfd
FV
7import locale
8import os
9import re
10import sys
11import zlib
d77c3dfd 12import email.utils
921a1455 13import json
d77c3dfd 14
01ba00ca 15try:
59ae15a5 16 import urllib.request as compat_urllib_request
01ba00ca 17except ImportError: # Python 2
59ae15a5 18 import urllib2 as compat_urllib_request
01ba00ca
PH
19
20try:
59ae15a5 21 import urllib.error as compat_urllib_error
01ba00ca 22except ImportError: # Python 2
59ae15a5 23 import urllib2 as compat_urllib_error
01ba00ca
PH
24
25try:
59ae15a5 26 import urllib.parse as compat_urllib_parse
01ba00ca 27except ImportError: # Python 2
59ae15a5 28 import urllib as compat_urllib_parse
01ba00ca 29
799c0763
PH
30try:
31 from urllib.parse import urlparse as compat_urllib_parse_urlparse
32except ImportError: # Python 2
33 from urlparse import urlparse as compat_urllib_parse_urlparse
34
01ba00ca 35try:
59ae15a5 36 import http.cookiejar as compat_cookiejar
01ba00ca 37except ImportError: # Python 2
59ae15a5 38 import cookielib as compat_cookiejar
01ba00ca 39
3e669f36 40try:
59ae15a5 41 import html.entities as compat_html_entities
9f37a959 42except ImportError: # Python 2
59ae15a5 43 import htmlentitydefs as compat_html_entities
3e669f36 44
a8156c1d 45try:
59ae15a5 46 import html.parser as compat_html_parser
9f37a959 47except ImportError: # Python 2
59ae15a5 48 import HTMLParser as compat_html_parser
a8156c1d 49
348d0a7a 50try:
59ae15a5 51 import http.client as compat_http_client
9f37a959 52except ImportError: # Python 2
59ae15a5 53 import httplib as compat_http_client
348d0a7a 54
5910e210
PH
# Polyfill for subprocess.DEVNULL (added in Python 3.3): return a writable
# handle that discards everything. The fallback opens os.devnull on demand.
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        return open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode *string*: each run of %XX escapes is gathered into a
        # byte sequence and decoded with *encoding* in one go, so multi-byte
        # UTF-8 sequences split across several escapes decode correctly.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Py2-only: str.decode('hex') turns the two hex digits into a byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        # Split a query string into an ordered list of (name, value) pairs.
        # Both '&' and ';' act as pair separators, as in cpython's parse_qsl.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        # Parse a query string into a dict mapping each name to the list of
        # values it appeared with (same shape as urllib.parse.parse_qs).
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 139
# On Python 2 text is the ``unicode`` type; on Python 3 that name is gone
# (NameError) and ``str`` is already text.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Same dance for the code-point -> one-character-string function.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 149
# Default HTTP headers added to every request by YoutubeDLHandler:
# a desktop Firefox User-Agent plus permissive Accept-* values.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec exists and can actually encode text;
        # some platforms report bogus values.
        u'TEST'.encode(pref)
    except Exception:
        # A bare ``except:`` here would also swallow SystemExit and
        # KeyboardInterrupt; only real lookup/encode failures should
        # trigger the fallback.
        pref = 'UTF-8'

    return pref
d77c3dfd 170
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print a unicode string (native text on Python 3)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print a unicode string, encoding it for Python 2 terminals."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 178
f4bfd65f
PH
# json.dump wants a byte stream on Python 2 but a character stream on
# Python 3, so pick the matching open() mode once, at import time.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as fobj:
            json.dump(obj, fobj)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (byte stream)."""
        with open(fn, 'wb') as fobj:
            json.dump(obj, fobj)
190
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: hexadecimal (&#xHH;) or decimal (&#NN;).
    # The previous pattern ``#(x?\d+)`` only allowed decimal digits after
    # the optional 'x', silently truncating hex references containing a-f
    # (e.g. '&#x2F;' was parsed as '&#x2;').
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 215
# Backport of a newer HTMLParser tokenizer regex so attribute values with
# unusual characters are recognized correctly on older Python versions.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix

class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name and value identifying the element to isolate.
        self.attribute = attribute
        self.value = value
        # self.result grows to [tag, startpos, endpos] as parsing proceeds.
        self.result = None
        self.started = False         # True while inside the matched element
        self.depth = {}              # per-tag nesting counters (for matching the close tag)
        self.html = None             # full document text, set by loads()
        self.watch_startpos = False  # next event records the content start position
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Best-effort recovery from malformed HTML: skip one line and resume
        # via goahead(); give up after 10 errors or once the target started.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        # Feed the whole document to the parser in one call.
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Depth of the opening tag back to zero: matched close tag found.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event right after the matched start tag pins down where its
    # content begins, so alias them all to find_startpos.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Return the text between the matched start tag and its closing tag,
        # or None when no complete [tag, start, end] triple was recorded.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single-line element: end offset is relative to the trimmed start.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
9e6dd238
FV
282
def get_element_by_id(id, html):
    """Convenience wrapper: return the content of the element whose "id"
    attribute equals *id* in the HTML document *html*."""
    return get_element_by_attribute("id", id, html)
286
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag whose *attribute* equals *value*
    in the HTML document *html*, or None when nothing matches."""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Markup too broken for AttrParser's recovery: fall through and
        # report whatever (possibly partial) result was collected.
        pass
    return attr_parser.get_result()
9e6dd238
FV
295
296
def clean_html(html):
    """Turn an HTML snippet into readable plain text."""
    # Collapse source newlines, then re-introduce them for <br> tags and
    # paragraph boundaries (</p><p>).
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode character entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text)
9e6dd238
FV
308
309
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Downloaded data is bytes; on Python 3 sys.stdout is a text
            # stream, so hand out its underlying binary buffer when present.
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
d77c3dfd
FV
335
336
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 344
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _map_char(ch):
        # Translate one character to its filesystem-safe replacement.
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'()[]{}$;`^,#' or ch.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return ch

    result = u''.join(_map_char(ch) for ch in s)
    if is_id:
        # IDs are kept verbatim apart from the per-character mapping.
        return result
    # Collapse runs of underscores and trim them from the ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never return an empty filename.
    return result or '_'
d77c3dfd
FV
376
def orderedSet(iterable):
    """Return a list of *iterable*'s elements with duplicates removed,
    preserving first-seen order.

    Membership is tested against the output list itself, so elements need
    not be hashable (at the cost of O(n^2) behaviour).
    """
    unique = []
    for element in iterable:
        if element in unique:
            continue
        unique.append(element)
    return unique
d77c3dfd
FV
384
def unescapeHTML(s):
    """Replace every HTML entity in the unicode string *s* with the
    character it denotes.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
393
def encodeFilename(s):
    """Encode a unicode filename for the platform's filesystem API.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s
    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s
    return s.encode(sys.getfilesystemencoding(), 'ignore')
d77c3dfd
FV
412
class DownloadError(Exception):
    """Raised by FileDownloader objects when downloading fails and the
    downloader is not configured to continue on errors; carries the
    appropriate error message.
    """
    pass
d77c3dfd
FV
421
422
class SameFileError(Exception):
    """Raised by FileDownloader objects when they detect that several
    downloads would end up writing to the same file on disk.
    """
    pass
d77c3dfd
FV
430
431
class PostProcessingError(Exception):
    """Raised from a PostProcessor's .run() method to signal that the
    postprocessing task failed.
    """
    pass
d77c3dfd
FV
439
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
d77c3dfd
FV
443
444
class UnavailableVideoError(Exception):
    """Raised when a video is requested in a format that is not available
    for that video.
    """
    pass
d77c3dfd
FV
452
453
class ContentTooShortError(Exception):
    """Raised by FileDownloader objects when the data they received is
    smaller than what the server announced, which usually means the
    connection was interrupted.
    """

    # Both counters are in bytes.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 468
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first (no zlib header, as some servers send),
        # then fall back to a standard zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer addinfourl (the one that grew getcode()) takes the status
        # code as a constructor argument; older versions need it set by hand.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force std_headers over any caller-supplied duplicates: delete the
        # existing header (if any) before adding ours.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Honour the internal opt-out header by dropping Accept-encoding,
        # then strip the marker itself before the request hits the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        # Transparently decompress the body, re-wrapping the stream so the
        # caller still sees a response object with the original metadata.
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # urllib dispatches handler methods by scheme; reuse the same hooks
    # for HTTPS traffic.
    https_request = http_request
    https_response = http_response