Allow users to specify encoding in each config files (#4357)

author Lesmiscore <redacted>

Fri, 15 Jul 2022 11:52:14 +0000 (20:52 +0900)

committer GitHub <redacted>

Fri, 15 Jul 2022 11:52:14 +0000 (20:52 +0900)
author Lesmiscore <redacted>
Fri, 15 Jul 2022 11:52:14 +0000 (20:52 +0900)
committer GitHub <redacted>
Fri, 15 Jul 2022 11:52:14 +0000 (20:52 +0900)
diff --git a/README.md b/README.md

index af5fb46ae8fb401fb23206691e28cd244697c2ed..b9e62d54b34e173f9aa6609bb7b296f62340179f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1161,6 +1161,15 @@ # Save all videos under YouTube directory in your home directory
  
  You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded.
  
+### Specifying encoding of config files
+
+By default, configuration files are read using the encoding of the system locale.
+If your configuration file is saved in a different encoding, declare it by writing `# coding: ENCODING` at the very beginning of the file (e.g. `# coding: shift-jis`).
+
+No characters, not even spaces, may precede the `# coding:` line.
+
+If the file starts with a byte order mark (BOM), the encoding indicated by the BOM is used instead of any `# coding:` declaration.
+
  ### Authentication with `.netrc` file
  
  You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you:
diff --git a/test/test_utils.py b/test/test_utils.py

index 948d5d0596ca0d056693ed5675560fc566b2270b..c668ff9e4277166eaa25afc055ff3ea71813e832 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -39,6 +39,7 @@
      datetime_from_str,
      detect_exe_version,
      determine_ext,
+    determine_file_encoding,
      dfxp2srt,
      dict_get,
      encode_base_n,
@@ -1822,6 +1823,33 @@ def test_locked_file(self):
              with contextlib.suppress(OSError):
                  os.remove(FILE)
  
+    def test_determine_file_encoding(self):
+        self.assertEqual(determine_file_encoding(b''), (None, 0))  # nothing to detect
+        self.assertEqual(determine_file_encoding(b'--verbose -x --audio-format mkv\n'), (None, 0))
+        # a leading BOM selects the encoding; its length is the number of bytes to skip
+        self.assertEqual(determine_file_encoding(b'\xef\xbb\xbf'), ('utf-8', 3))
+        self.assertEqual(determine_file_encoding(b'\x00\x00\xfe\xff'), ('utf-32-be', 4))
+        self.assertEqual(determine_file_encoding(b'\xff\xfe'), ('utf-16-le', 2))
+        # Emacs-style "-*- coding: ... -*-" line, with no newline, LF and CRLF endings
+        self.assertEqual(determine_file_encoding(b'# -*- coding: cp932 -*-'), ('cp932', 0))
+        self.assertEqual(determine_file_encoding(b'# -*- coding: cp932 -*-\n'), ('cp932', 0))
+        self.assertEqual(determine_file_encoding(b'# -*- coding: cp932 -*-\r\n'), ('cp932', 0))
+        # plain "# coding:" line; the name is returned as written, without validation
+        self.assertEqual(determine_file_encoding(b'# coding: utf-8\n--verbose'), ('utf-8', 0))
+        self.assertEqual(determine_file_encoding(b'# coding: someencodinghere-12345\n--verbose'), ('someencodinghere-12345', 0))
+        # vi modeline "fileencoding="; only the first entry of a comma-separated list is used
+        self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932'), ('cp932', 0))
+        self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932\n'), ('cp932', 0))
+        self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932\r\n'), ('cp932', 0))
+        self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932,euc-jp\r\n'), ('cp932', 0))
+        # BOM-less declarations that are themselves encoded as UTF-32-BE / UTF-16-LE
+        self.assertEqual(determine_file_encoding(
+            b'\0\0\0#\0\0\0 \0\0\0c\0\0\0o\0\0\0d\0\0\0i\0\0\0n\0\0\0g\0\0\0:\0\0\0 \0\0\0u\0\0\0t\0\0\0f\0\0\0-\0\0\x003\0\0\x002\0\0\0-\0\0\0b\0\0\0e'),
+            ('utf-32-be', 0))
+        self.assertEqual(determine_file_encoding(
+            b'#\0 \0c\0o\0d\0i\0n\0g\0:\0 \0u\0t\0f\0-\x001\x006\0-\0l\0e\0'),
+            ('utf-16-le', 0))
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index 6e0c31c012ec0b5c4b69bd387bf2844a0dc472c6..5d4e607abaaf76e6dff543b5110fc0d392fc84d9 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3485,17 +3485,19 @@ def age_restricted(content_limit, age_limit):
      return age_limit < content_limit
  
  
+BOMS = [
+    (b'\xef\xbb\xbf', 'utf-8'),
+    (b'\x00\x00\xfe\xff', 'utf-32-be'),
+    (b'\xff\xfe\x00\x00', 'utf-32-le'),  # must precede utf-16-le, whose BOM is a prefix of this one
+    (b'\xff\xfe', 'utf-16-le'),
+    (b'\xfe\xff', 'utf-16-be'),
+]
+""" List of known byte-order-marks (BOM) """
+
+
  def is_html(first_bytes):
      """ Detect whether a file contains HTML by examining its first bytes. """
  
-    BOMS = [
-        (b'\xef\xbb\xbf', 'utf-8'),
-        (b'\x00\x00\xfe\xff', 'utf-32-be'),
-        (b'\xff\xfe\x00\x00', 'utf-32-le'),
-        (b'\xff\xfe', 'utf-16-le'),
-        (b'\xfe\xff', 'utf-16-be'),
-    ]
-
      encoding = 'utf-8'
      for bom, enc in BOMS:
          while first_bytes.startswith(bom):
@@ -5394,6 +5396,41 @@ def read_stdin(what):
      return sys.stdin
  
  
+def determine_file_encoding(data):
+    """
+    Detect the text encoding of a file from its leading bytes
+    (Config.read_file passes the first 512 bytes).
+
+    @returns (encoding name, or None if undetected; number of bytes to skip)
+    """
+
+    for bom, enc in BOMS:
+        # a matching BOM takes precedence over any in-file declaration
+        # its length is returned so that the caller can seek past it
+        if data.startswith(bom):
+            return enc, len(bom)
+
+    # drop all null bytes so that an ASCII declaration encoded as UTF-16 or UTF-32
+    # still matches; endianness is irrelevant once only the ASCII bytes remain
+    data = data.replace(b'\0', b'')
+
+    PREAMBLES = [
+        # "# -*- coding: utf-8 -*-"
+        # "# coding: utf-8"
+        rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
+        # "# vi: set fileencoding=utf-8"
+        rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
+    ]
+    for pb in PREAMBLES:
+        mobj = re.match(pb, data)  # re.match anchors at offset 0: only the first line can declare it
+        if not mobj:
+            continue
+        # nothing is skipped for a preamble: it is a comment line for the config parser
+        return mobj.group('encoding').decode(), 0
+
+    return None, 0
+
+
  class Config:
      own_args = None
      parsed_args = None
@@ -5445,12 +5482,17 @@ def __str__(self):
      @staticmethod
      def read_file(filename, default=[]):
          try:
-            optionf = open(filename)
+            optionf = open(filename, 'rb')
          except OSError:
              return default  # silently skip if file is not present
+        try:
+            enc, skip = determine_file_encoding(optionf.read(512))
+            optionf.seek(skip, io.SEEK_SET)
+        except OSError:
+            enc = None  # silently skip read errors
          try:
              # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
-            contents = optionf.read()
+            contents = optionf.read().decode(enc or preferredencoding())
              res = shlex.split(contents, comments=True)
          except Exception as err:
              raise ValueError(f'Unable to parse "{filename}": {err}')
author	Lesmiscore <redacted>
	Fri, 15 Jul 2022 11:52:14 +0000 (20:52 +0900)
committer	GitHub <redacted>
	Fri, 15 Jul 2022 11:52:14 +0000 (20:52 +0900)
README.md		patch \| blob \| blame \| history
test/test_utils.py		patch \| blob \| blame \| history
yt_dlp/utils.py		patch \| blob \| blame \| history