]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
correct to_stderr
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
d77c3dfd
FV
6import locale
7import os
8import re
9import sys
10import zlib
d77c3dfd 11import email.utils
921a1455 12import json
d77c3dfd 13
01ba00ca
PH
14try:
15 import urllib.request as compat_urllib_request
16except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
18
19try:
20 import urllib.error as compat_urllib_error
21except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
23
24try:
25 import urllib.parse as compat_urllib_parse
26except ImportError: # Python 2
da779b49 27 import urllib as compat_urllib_parse
01ba00ca
PH
28
29try:
30 import http.cookiejar as compat_cookiejar
31except ImportError: # Python 2
32 import cookielib as compat_cookiejar
33
3e669f36
PH
34try:
35 import html.entities as compat_html_entities
9f37a959 36except ImportError: # Python 2
3e669f36
PH
37 import htmlentitydefs as compat_html_entities
38
a8156c1d
PH
39try:
40 import html.parser as compat_html_parser
9f37a959 41except ImportError: # Python 2
a8156c1d
PH
42 import HTMLParser as compat_html_parser
43
348d0a7a 44try:
5bd9cc7a 45 import http.client as compat_http_client
9f37a959 46except ImportError: # Python 2
5bd9cc7a 47 import httplib as compat_http_client
348d0a7a 48
9f37a959 49try:
73dce4b2 50 from urllib.parse import parse_qs as compat_parse_qs
9f37a959
PH
51except ImportError: # Python 2
52 from urlparse import parse_qs as compat_parse_qs
348d0a7a 53
3e669f36
PH
# Text string type: Python 2 spells it ``unicode``; where that name is
# missing (Python 3) plain ``str`` is used instead.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: ``unichr`` on Python 2,
# ``chr`` where ``unichr`` does not exist.
try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr


# Headers that YoutubeDLHandler.http_request() puts on each request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
d77c3dfd
FV
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the lookup
    fails, or the reported encoding cannot be used to encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or unusable name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Intentionally broad (any lookup/codec failure falls back), but
        # unlike a bare ``except:`` it lets KeyboardInterrupt and
        # SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd
FV
85
86
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are resolved through compat_html_entities.name2codepoint.
    Numeric references may be decimal (#39) or hexadecimal (#x27 / #X27).
    Anything that cannot be resolved is returned unchanged as u'&<entity>;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The pattern is anchored and accepts the
    # full hex digit range so that e.g. '#x2F' is not cut short to '#x2'.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        try:
            if hexstr is not None:
                return compat_chr(int(hexstr, 16))
            return compat_chr(int(decstr, 10))
        except (ValueError, OverflowError):
            # Code point outside what chr()/unichr() accepts: fall through
            # and hand the reference back untouched.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
111
a8156c1d
PH
# Overrides the start-tag locator regex on the HTML parser module; the
# original note marks this as a backported bugfix.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix


class IDParser(compat_html_parser.HTMLParser):
    """HTMLParser variant that isolates the element carrying a given id.

    While parsing, ``result`` grows into ``[tag, start_pos, end_pos]``
    where the positions are getpos() tuples taken just after the opening
    tag and at the matching closing tag.
    """

    def __init__(self, id):
        self.id = id
        self.result = None           # [tag, start_pos, end_pos] once complete
        self.started = False         # True while inside the wanted element
        self.depth = {}              # open-tag counters, keyed by tag name
        self.html = None             # full document, kept for slicing later
        self.watch_startpos = False  # next parser event marks the start pos
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error, unless too many occurred or the wanted
        element is currently being captured (then the error is raised)."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining = self.html.split('\n')[self.getpos()[0]:]
        self.rawdata = '\n'.join(remaining) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document in one go."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if not self.started:
            return
        # Count open tags so the matching close of the wanted tag is found.
        self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())

    # Any other parser event right after the opening tag records the start.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None
        when no complete [tag, start, end] triple was captured."""
        if self.result is None or len(self.result) != 3:
            return None
        start_line, start_col = self.result[1][0], self.result[1][1]
        end_line, end_col = self.result[2][0], self.result[2][1]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be shifted by
            # what was just cut off the front.
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
177
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Parse errors are ignored; get_result() below decides whether a
        # complete element was captured before the failure.
        pass
    return id_parser.get_result()
186
187
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Literal newlines become spaces, <br> tags (with surrounding whitespace)
    become newlines, all remaining tags are dropped and HTML entities are
    decoded through unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw string literals: '\s' in a normal literal is an invalid escape
    # sequence (the pattern text itself is unchanged).
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
198
199
d77c3dfd
FV
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename u'-' selects standard output; for a binary
    open_mode the underlying byte stream (sys.stdout.buffer) is returned
    when the interpreter provides one.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Where sys.stdout is a text wrapper (Python 3) bytes must go
            # to its .buffer; plain file-like stdout has no such attribute.
            if 'b' in open_mode and hasattr(sys.stdout, 'buffer'):
                return (sys.stdout.buffer, filename)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
225
226
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94
PH
234
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def replace_insane(char):
        # Per-character mapping; the checks are ordered, first match wins.
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            if restricted:
                return ''
            return '\''
        if char == ':':
            if restricted:
                return '_-'
            return ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace() or code > 127):
            return '_'
        return char

    cleaned = u''.join(replace_insane(char) for char in s)
    # Collapse runs of underscores, then trim them off both ends.
    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    # Never hand back an empty name.
    return cleaned if cleaned else '_'
d77c3dfd
FV
264
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of each element, in
    input order.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
272
def unescapeHTML(s):
    """
    @param s a unicode string; every '&...;' sequence in it is passed
             through htmlentity_transform()
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
281
def encodeFilename(s):
    """
    @param s The name of the file (a unicode string)

    Returns the name in the form to hand to the file APIs: unchanged on
    Windows 2000 and up, otherwise encoded with the filesystem encoding
    (characters that cannot be encoded are dropped).
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        # Python 2 may report None here; encode() needs a real codec name.
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
296
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this when they are not configured
    to continue on errors; the exception carries the appropriate error
    message.
    """
305
306
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """
314
315
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this to signal an error
    in the postprocessing task.
    """
323
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
327
328
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
336
337
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this when a downloaded file turns
    out smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Both sizes are in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
352
353
0b8c922d
FV
class Trouble(Exception):
    """Trouble helper exception

    An exception meant to be handled with FileDownloader.trouble
    """
360
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Once installed with an OpenerDirector, this class adds the standard
    headers (std_headers) to every HTTP request and transparently handles
    gzipped and deflated responses from web servers. To avoid compression
    for one particular request, the calling code only has to include the
    HTTP header "Youtubedl-No-Compression" in it; that header is removed
    before the real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream if that fails."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying a status code, whether or
        not the available addinfourl takes the code as an argument."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        """Force the standard headers onto the request and honour the
        "Youtubedl-No-Compression" marker header."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp, re-wrapped around the decompressed body when its
        Content-encoding is gzip or deflate."""
        original = resp
        encoding = original.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(original.read()), mode='r')
        elif encoding == 'deflate':
            body = io.BytesIO(self.deflate(original.read()))
        else:
            # Not compressed (or an encoding not handled here): pass through.
            return resp
        resp = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
        resp.msg = original.msg
        return resp