jfr.im git - erebus.git/commitdiff
urls - beautifulsoup: use encoding from Content-Type if available
author John Runyon <redacted>
Tue, 24 Oct 2023 02:21:50 +0000 (20:21 -0600)
committer John Runyon <redacted>
Tue, 24 Oct 2023 02:21:50 +0000 (20:21 -0600)
modules/urls.py

index 67abc24a626b737c4c4a5eca008ad1caced9f697..5a8b4a6a1852c1c0a5301b8897f49528311fce67 100644 (file)
@@ -289,7 +289,13 @@ def goturl(url):
                return response
 
        # Try to add type and length headers to reply
-       c_type = response.getheader('Content-Type', '').split(';', 1)[0]
+       c_type_fields = response.getheader('Content-Type', '').split(';')
+       c_type = c_type_fields.pop(0)
+       c_charset = None
+       for f in c_type_fields:
+               f = f.strip()
+               if len(f) > 8 and f[0:8] == 'charset=':
+                       c_charset = f[8:]
        c_len = response.getheader('Content-Length')
        if c_type != '':
                output.append("[%s] " % (c_type))
@@ -316,7 +322,7 @@ def goturl(url):
                        else:
                                output.append("[%s] " % (_humanize_bytes(len(responsebody))))
                        try:
-                               soup = BeautifulSoup(responsebody)
+                               soup = BeautifulSoup(responsebody, from_encoding=c_charset)
                                if soup.title:
                                        output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
                                else: