Allow users to specify encoding in each config files (#4357)

[yt-dlp.git] / yt_dlp / utils.py
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index 6e0c31c012ec0b5c4b69bd387bf2844a0dc472c6..5d4e607abaaf76e6dff543b5110fc0d392fc84d9 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3485,17 +3485,19 @@ def age_restricted(content_limit, age_limit):
      return age_limit < content_limit
  
  
+BOMS = [
+    (b'\xef\xbb\xbf', 'utf-8'),
+    (b'\x00\x00\xfe\xff', 'utf-32-be'),
+    (b'\xff\xfe\x00\x00', 'utf-32-le'),
+    (b'\xff\xfe', 'utf-16-le'),
+    (b'\xfe\xff', 'utf-16-be'),
+]
+""" List of known byte-order-marks (BOM) """
+
+
  def is_html(first_bytes):
      """ Detect whether a file contains HTML by examining its first bytes. """
  
-    BOMS = [
-        (b'\xef\xbb\xbf', 'utf-8'),
-        (b'\x00\x00\xfe\xff', 'utf-32-be'),
-        (b'\xff\xfe\x00\x00', 'utf-32-le'),
-        (b'\xff\xfe', 'utf-16-le'),
-        (b'\xfe\xff', 'utf-16-be'),
-    ]
-
      encoding = 'utf-8'
      for bom, enc in BOMS:
          while first_bytes.startswith(bom):
@@ -5394,6 +5396,41 @@ def read_stdin(what):
      return sys.stdin
  
  
+def determine_file_encoding(data):
+    """
+    From the first 512 bytes of a given file,
+    it tries to detect the encoding to be used to read as text.
+
+    @returns (encoding, bytes to skip)
+    """
+
+    for bom, enc in BOMS:
+        # matching BOM beats any declaration
+        # BOMs are skipped to prevent any errors
+        if data.startswith(bom):
+            return enc, len(bom)
+
+    # strip off all null bytes to match even when UTF-16 or UTF-32 is used
+    # endians don't matter
+    data = data.replace(b'\0', b'')
+
+    PREAMBLES = [
+        # "# -*- coding: utf-8 -*-"
+        # "# coding: utf-8"
+        rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
+        # "# vi: set fileencoding=utf-8"
+        rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
+    ]
+    for pb in PREAMBLES:
+        mobj = re.match(pb, data)
+        if not mobj:
+            continue
+        # preambles aren't skipped since they're just ignored when reading as config
+        return mobj.group('encoding').decode(), 0
+
+    return None, 0
+
+
  class Config:
      own_args = None
      parsed_args = None
@@ -5445,12 +5482,17 @@ def __str__(self):
      @staticmethod
      def read_file(filename, default=[]):
          try:
-            optionf = open(filename)
+            optionf = open(filename, 'rb')
          except OSError:
              return default  # silently skip if file is not present
+        try:
+            enc, skip = determine_file_encoding(optionf.read(512))
+            optionf.seek(skip, io.SEEK_SET)
+        except OSError:
+            enc = None  # silently skip read errors
          try:
              # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
-            contents = optionf.read()
+            contents = optionf.read().decode(enc or preferredencoding())
              res = shlex.split(contents, comments=True)
          except Exception as err:
              raise ValueError(f'Unable to parse "{filename}": {err}')