[outtmpl] Format type `U` for unicode normalization

author pukkandan <redacted>

Sat, 25 Sep 2021 20:09:44 +0000 (01:39 +0530)

committer pukkandan <redacted>

Sat, 25 Sep 2021 20:11:01 +0000 (01:41 +0530)
author pukkandan <redacted>
Sat, 25 Sep 2021 20:09:44 +0000 (01:39 +0530)
committer pukkandan <redacted>
Sat, 25 Sep 2021 20:11:01 +0000 (01:41 +0530)
diff --git a/README.md b/README.md

index a1488028229accbe77e1f6044d98abbf0a6333cb..d13eb4dc1e31762dbf93292da1bfe726a7f5ed45 100644 (file)
--- a/README.md
+++ b/README.md
@@ -964,6 +964,7 @@ # OUTPUT TEMPLATE
  1. **Alternatives**: Alternate fields can be specified separated with a `,`. Eg: `%(release_date>%Y,upload_date>%Y|Unknown)s`
  1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-template`. Eg: `%(uploader|Unknown)s`
  1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `B`, `j`, `l`, `q` can be used for converting to **B**ytes, **j**son, a comma separated **l**ist and a string **q**uoted for the terminal respectively
+1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. Eg: `%(title)+.100U` is NFKC
  
  To summarize, the general syntax for a field is:
  ```
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py

index 6feca2ce249f62eb9c20582ad88105e6c8091ef2..f6483575f330d8153678d588d8b80886e78bb7ce 100644 (file)
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -649,7 +649,7 @@ def test_add_extra_info(self):
          'title2': '%PATH%',
          'title3': 'foo/bar\\test',
          'title4': 'foo "bar" test',
-        'title5': 'áéí',
+        'title5': 'áéí 𝐀',
          'timestamp': 1618488000,
          'duration': 100000,
          'playlist_index': 1,
@@ -769,6 +769,10 @@ def expect_same_infodict(out):
          test('%(formats.:.id) 15l', '  id1, id2, id3')
          test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS))))
          test('%(title5).3B', 'á')
+        test('%(title5)U', 'áéí 𝐀')
+        test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀')
+        test('%(title5)+U', 'áéí A')
+        test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A')
          if compat_os_name == 'nt':
              test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'"))
          else:
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 11371fa860aa2593433db9726010018438ed47ee..a6eddd7f78f24622b1609358541e0252d52e0387 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -25,6 +25,7 @@
  import tokenize
  import traceback
  import random
+import unicodedata
  
  from string import ascii_letters
  
@@ -908,7 +909,7 @@ def escape_outtmpl(outtmpl):
      def validate_outtmpl(cls, outtmpl):
          ''' @return None or Exception object '''
          outtmpl = re.sub(
-            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqB]'),
+            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
              lambda mobj: f'{mobj.group(0)[:-1]}s',
              cls._outtmpl_expandpath(outtmpl))
          try:
@@ -940,7 +941,7 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
          }
  
          TMPL_DICT = {}
-        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqB]'))
+        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
          MATH_FUNCTIONS = {
              '+': float.__add__,
              '-': float.__sub__,
@@ -1031,21 +1032,26 @@ def create_key(outer_mobj):
              value = default if value is None else value
  
              str_fmt = f'{fmt[:-1]}s'
-            if fmt[-1] == 'l':
+            if fmt[-1] == 'l':  # list
                  value, fmt = ', '.join(variadic(value)), str_fmt
-            elif fmt[-1] == 'j':
+            elif fmt[-1] == 'j':  # json
                  value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
-            elif fmt[-1] == 'q':
+            elif fmt[-1] == 'q':  # quoted
                  value, fmt = compat_shlex_quote(str(value)), str_fmt
-            elif fmt[-1] == 'B':
+            elif fmt[-1] == 'B':  # bytes
                  value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
                  value, fmt = value.decode('utf-8', 'ignore'), 's'
+            elif fmt[-1] == 'U':  # unicode normalized
+                opts = outer_mobj.group('conversion') or ''
+                value, fmt = unicodedata.normalize(
+                    # "+" = compatibility equivalence, "#" = NFD
+                    'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'),
+                    value), str_fmt
              elif fmt[-1] == 'c':
-                value = str(value)
-                if value is None:
-                    value, fmt = default, 's'
+                if value:
+                    value = str(value)[0]
                  else:
-                    value = value[0]
+                    fmt = str_fmt
              elif fmt[-1] not in 'rs':  # numeric
                  value = float_or_none(value)
                  if value is None:
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index 141d2c9ccd7dde8463062e8e102468073b8357de..770d7feb9c75a9a3cee49bf2cec0e79bd69bf7af 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -4474,12 +4474,12 @@ def q(qid):
  STR_FORMAT_RE_TMPL = r'''(?x)
      (?<!%)(?P<prefix>(?:%%)*)
      %
-    (?P<has_key>\((?P<key>{0})\))?  # mapping key
+    (?P<has_key>\((?P<key>{0})\))?
      (?P<format>
-        (?:[#0\-+ ]+)?  # conversion flags (optional)
-        (?:\d+)?  # minimum field width (optional)
-        (?:\.\d+)?  # precision (optional)
-        [hlL]?  # length modifier (optional)
+        (?P<conversion>[#0\-+ ]+)?
+        (?P<min_width>\d+)?
+        (?P<precision>\.\d+)?
+        (?P<len_mod>[hlL])?  # unused in python
          {1}  # conversion type
      )
  '''
author	pukkandan <redacted>
	Sat, 25 Sep 2021 20:09:44 +0000 (01:39 +0530)
committer	pukkandan <redacted>
	Sat, 25 Sep 2021 20:11:01 +0000 (01:41 +0530)
README.md		patch \| blob \| blame \| history
test/test_YoutubeDL.py		patch \| blob \| blame \| history
yt_dlp/YoutubeDL.py		patch \| blob \| blame \| history
yt_dlp/utils.py		patch \| blob \| blame \| history