From: John Runyon
Date: Tue, 24 Oct 2023 02:21:50 +0000 (-0600)
Subject: urls - beautifulsoup: use encoding from Content-Type if available
X-Git-Url: https://jfr.im/git/erebus.git/commitdiff_plain/b91b84fabab3a5dbbc50cae53a54dcedaa96d903

urls - beautifulsoup: use encoding from Content-Type if available
---

diff --git a/modules/urls.py b/modules/urls.py
index 67abc24..5a8b4a6 100644
--- a/modules/urls.py
+++ b/modules/urls.py
@@ -289,7 +289,13 @@ def goturl(url):
             return response
 
     # Try to add type and length headers to reply
-    c_type = response.getheader('Content-Type', '').split(';', 1)[0]
+    c_type_fields = response.getheader('Content-Type', '').split(';')
+    c_type = c_type_fields.pop(0)
+    c_charset = None
+    for f in c_type_fields:
+        f = f.strip()
+        if len(f) > 8 and f[0:8] == 'charset=':
+            c_charset = f[8:]
     c_len = response.getheader('Content-Length')
     if c_type != '':
         output.append("[%s] " % (c_type))
@@ -316,7 +322,7 @@ def goturl(url):
         else:
             output.append("[%s] " % (_humanize_bytes(len(responsebody))))
         try:
-            soup = BeautifulSoup(responsebody)
+            soup = BeautifulSoup(responsebody, from_encoding=c_charset)
             if soup.title:
                 output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
             else: