Add option `--replace-in-metadata`

author pukkandan <redacted>

Mon, 9 Aug 2021 19:52:55 +0000 (01:22 +0530)

committer pukkandan <redacted>

Mon, 9 Aug 2021 19:52:55 +0000 (01:22 +0530)
author pukkandan <redacted>
Mon, 9 Aug 2021 19:52:55 +0000 (01:22 +0530)
committer pukkandan <redacted>
Mon, 9 Aug 2021 19:52:55 +0000 (01:22 +0530)
diff --git a/README.md b/README.md

index be4323771b42d6612ec3444583394de065c83175..493f437d1ba6f81b8d874784d65f294140475528 100644 (file)
--- a/README.md
+++ b/README.md
@@ -777,6 +777,10 @@ ## Post-Processing Options:
      --parse-metadata FROM:TO         Parse additional metadata like title/artist
                                       from other fields; see "MODIFYING METADATA"
                                       for details
+    --replace-in-metadata FIELDS REGEX REPLACE
+                                     Replace text in a metadata field using the
+                                     given regex. This option can be used
+                                     multiple times
      --xattrs                         Write metadata to the video file's xattrs
                                       (using dublin core and xdg standards)
      --fixup POLICY                   Automatically correct known faults of the
@@ -1333,7 +1337,11 @@ # preferring better codec and then larger total bitrate for the same resolution
  
  # MODIFYING METADATA
  
-The metadata obtained the the extractors can be modified by using `--parse-metadata FROM:TO`. The general syntax is to give the name of a field or a template (with similar syntax to [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
+The metadata obtained by the extractors can be modified by using `--parse-metadata` and `--replace-in-metadata`.
+
+`--replace-in-metadata FIELDS REGEX REPLACE` is used to replace text in any metadata field using a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax). [Backreferences](https://docs.python.org/3/library/re.html?highlight=backreferences#re.sub) can be used in the replace string for advanced use.
+
+The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or a template (with same syntax as [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
  
  Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--add-metadata`.
  
@@ -1380,6 +1388,9 @@ # Set title as "Series name S01E05"
  # Set "comment" field in video metadata using description instead of webpage_url
  $ yt-dlp --parse-metadata 'description:(?s)(?P<meta_comment>.+)' --add-metadata
  
+# Replace all spaces and "_" in title and uploader with a `-`
+$ yt-dlp --replace-in-metadata 'title,uploader' '[ _]' '-'
+
  ```
  
  # EXTRACTOR ARGUMENTS
diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py

index bdc2d93cb6ffe55fee1d1b17dcaacf00ccc3023a..320e69e8873560722fb8e8a0a49a5cad39b6f673 100644 (file)
--- a/test/test_postprocessors.py
+++ b/test/test_postprocessors.py
@@ -14,29 +14,28 @@
      ExecAfterDownloadPP,
      FFmpegThumbnailsConvertorPP,
      MetadataFromFieldPP,
-    MetadataFromTitlePP,
+    MetadataParserPP,
  )
  
  
  class TestMetadataFromField(unittest.TestCase):
-    def test_format_to_regex(self):
-        pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
-        self.assertEqual(pp._data[0]['regex'], r'(?P<title>.+)\ \-\ (?P<artist>.+)')
-
-    def test_field_to_outtmpl(self):
-        pp = MetadataFromFieldPP(None, ['title:%(title)s : %(artist)s'])
-        self.assertEqual(pp._data[0]['tmpl'], '%(title)s')
-
-    def test_in_out_seperation(self):
-        pp = MetadataFromFieldPP(None, ['%(title)s \\: %(artist)s:%(title)s : %(artist)s'])
-        self.assertEqual(pp._data[0]['in'], '%(title)s : %(artist)s')
-        self.assertEqual(pp._data[0]['out'], '%(title)s : %(artist)s')
-
  
-class TestMetadataFromTitle(unittest.TestCase):
      def test_format_to_regex(self):
-        pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
-        self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+        self.assertEqual(
+            MetadataParserPP.format_to_regex('%(title)s - %(artist)s'),
+            r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+        self.assertEqual(MetadataParserPP.format_to_regex(r'(?P<x>.+)'), r'(?P<x>.+)')
+
+    def test_field_to_template(self):
+        self.assertEqual(MetadataParserPP.field_to_template('title'), '%(title)s')
+        self.assertEqual(MetadataParserPP.field_to_template('1'), '1')
+        self.assertEqual(MetadataParserPP.field_to_template('foo bar'), 'foo bar')
+        self.assertEqual(MetadataParserPP.field_to_template(' literal'), ' literal')
+
+    def test_metadatafromfield(self):
+        self.assertEqual(
+            MetadataFromFieldPP.to_action('%(title)s \\: %(artist)s:%(title)s : %(artist)s'),
+            (MetadataParserPP.Actions.INTERPRET, '%(title)s : %(artist)s', '%(title)s : %(artist)s'))
  
  
  class TestConvertThumbnail(unittest.TestCase):
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 2791d180acb7c0e3de6d2f1fec54d17ddc2242d5..72d9f2c336c27fadf8f746d9bdd5978eb5a6d3ad 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1281,7 +1281,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}):
              ie_result = self.process_video_result(ie_result, download=download)
              additional_urls = (ie_result or {}).get('additional_urls')
              if additional_urls:
-                # TODO: Improve MetadataFromFieldPP to allow setting a list
+                # TODO: Improve MetadataParserPP to allow setting a list
                  if isinstance(additional_urls, compat_str):
                      additional_urls = [additional_urls]
                  self.to_screen(
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py

index 7d081207904bd650132c4fb2ffb1fa7b5936aa85..73e3f9f78602fa3c4c334c1eb30990fe52ad6eb9 100644 (file)
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -7,6 +7,7 @@
  
  import codecs
  import io
+import itertools
  import os
  import random
  import re
@@ -18,6 +19,7 @@
  )
  from .compat import (
      compat_getpass,
+    compat_shlex_quote,
      workaround_optparse_bug9161,
  )
  from .cookies import SUPPORTED_BROWSERS
@@ -46,14 +48,15 @@
  from .extractor import gen_extractors, list_extractors
  from .extractor.common import InfoExtractor
  from .extractor.adobepass import MSO_INFO
-from .postprocessor.ffmpeg import (
+from .postprocessor import (
      FFmpegExtractAudioPP,
      FFmpegSubtitlesConvertorPP,
      FFmpegThumbnailsConvertorPP,
      FFmpegVideoConvertorPP,
      FFmpegVideoRemuxerPP,
+    MetadataFromFieldPP,
+    MetadataParserPP,
  )
-from .postprocessor.metadatafromfield import MetadataFromFieldPP
  from .YoutubeDL import YoutubeDL
  
  
@@ -344,13 +347,29 @@ def validate_outtmpl(tmpl, msg):
          if re.match(InfoExtractor.FormatSort.regex, f) is None:
              parser.error('invalid format sort string "%s" specified' % f)
  
-    if opts.metafromfield is None:
-        opts.metafromfield = []
+    def metadataparser_actions(f):
+        """Yield validated MetadataParserPP action tuples for one option value.
+
+        A str value is handled as a --parse-metadata argument; any other value
+        is handled as the (FIELDS, REGEX, REPLACE) arguments of
+        --replace-in-metadata. Invalid input is reported via parser.error().
+        """
+        if isinstance(f, str):
+            cmd = '--parse-metadata %s' % compat_shlex_quote(f)
+            try:
+                actions = [MetadataFromFieldPP.to_action(f)]
+            except Exception as err:
+                parser.error(f'{cmd} is invalid; {err}')
+        else:
+            cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f))
+            # FIELDS is comma-separated: build one REPLACE action per field name
+            actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(','))
+
+        for action in actions:
+            try:
+                MetadataParserPP.validate_action(*action)
+            except Exception as err:
+                parser.error(f'{cmd} is invalid; {err}')
+            yield action
+
+    if opts.parse_metadata is None:
+        opts.parse_metadata = []
      if opts.metafromtitle is not None:
-        opts.metafromfield.append('title:%s' % opts.metafromtitle)
-    for f in opts.metafromfield:
-        if re.match(MetadataFromFieldPP.regex, f) is None:
-            parser.error('invalid format string "%s" specified for --parse-metadata' % f)
+        opts.parse_metadata.append('title:%s' % opts.metafromtitle)
+    opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata)))
  
      any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
      any_printing = opts.print_json
@@ -402,10 +421,10 @@ def report_conflict(arg1, arg2):
  
      # PostProcessors
      postprocessors = []
-    if opts.metafromfield:
+    if opts.parse_metadata:
          postprocessors.append({
-            'key': 'MetadataFromField',
-            'formats': opts.metafromfield,
+            'key': 'MetadataParser',
+            'actions': opts.parse_metadata,
              # Run this immediately after extraction is complete
              'when': 'pre_process'
          })
diff --git a/yt_dlp/options.py b/yt_dlp/options.py

index aef2f8143412a33e78534514873ff8991f854871..f8cfdeb1267915a1469d516e5fc9d1cf68caaf93 100644 (file)
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -1241,10 +1241,14 @@ def _dict_from_options_callback(
          help=optparse.SUPPRESS_HELP)
      postproc.add_option(
          '--parse-metadata',
-        metavar='FROM:TO', dest='metafromfield', action='append',
+        metavar='FROM:TO', dest='parse_metadata', action='append',
          help=(
              'Parse additional metadata like title/artist from other fields; '
              'see "MODIFYING METADATA" for details'))
+    postproc.add_option(
+        '--replace-in-metadata',
+        dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3,
+        help='Replace text in a metadata field using the given regex. This option can be used multiple times')
      postproc.add_option(
          '--xattrs',
          action='store_true', dest='xattrs', default=False,
diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py

index 98cbe866574d2e2466a7cf3fdf8f91628ee561e6..b1a6917d704927184415faabd2c2a11ac6c1a9cc 100644 (file)
--- a/yt_dlp/postprocessor/__init__.py
+++ b/yt_dlp/postprocessor/__init__.py
@@ -20,8 +20,11 @@
  )
  from .xattrpp import XAttrMetadataPP
  from .execafterdownload import ExecAfterDownloadPP
-from .metadatafromfield import MetadataFromFieldPP
-from .metadatafromfield import MetadataFromTitlePP
+from .metadataparser import (
+    MetadataFromFieldPP,
+    MetadataFromTitlePP,
+    MetadataParserPP,
+)
  from .movefilesafterdownload import MoveFilesAfterDownloadPP
  from .sponskrub import SponSkrubPP
  
@@ -48,6 +51,7 @@ def get_postprocessor(key):
      'FFmpegThumbnailsConvertorPP',
      'FFmpegVideoConvertorPP',
      'FFmpegVideoRemuxerPP',
+    'MetadataParserPP',
      'MetadataFromFieldPP',
      'MetadataFromTitlePP',
      'MoveFilesAfterDownloadPP',
diff --git a/yt_dlp/postprocessor/metadatafromfield.py b/yt_dlp/postprocessor/metadatafromfield.py

deleted file mode 100644 (file)

index 0027947..0000000
--- a/yt_dlp/postprocessor/metadatafromfield.py
+++ /dev/null
@@ -1,74 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import PostProcessor
-from ..compat import compat_str
-
-
-class MetadataFromFieldPP(PostProcessor):
-    regex = r'(?P<in>.*?)(?<!\\):(?P<out>.+)$'
-
-    def __init__(self, downloader, formats):
-        PostProcessor.__init__(self, downloader)
-        assert isinstance(formats, (list, tuple))
-        self._data = []
-        for f in formats:
-            assert isinstance(f, compat_str)
-            match = re.match(self.regex, f)
-            assert match is not None
-            inp = match.group('in').replace('\\:', ':')
-            self._data.append({
-                'in': inp,
-                'out': match.group('out'),
-                'tmpl': self.field_to_template(inp),
-                'regex': self.format_to_regex(match.group('out')),
-            })
-
-    @staticmethod
-    def field_to_template(tmpl):
-        if re.match(r'[a-zA-Z_]+$', tmpl):
-            return '%%(%s)s' % tmpl
-        return tmpl
-
-    @staticmethod
-    def format_to_regex(fmt):
-        r"""
-        Converts a string like
-           '%(title)s - %(artist)s'
-        to a regex like
-           '(?P<title>.+)\ \-\ (?P<artist>.+)'
-        """
-        if not re.search(r'%\(\w+\)s', fmt):
-            return fmt
-        lastpos = 0
-        regex = ''
-        # replace %(..)s with regex group and escape other string parts
-        for match in re.finditer(r'%\((\w+)\)s', fmt):
-            regex += re.escape(fmt[lastpos:match.start()])
-            regex += r'(?P<%s>.+)' % match.group(1)
-            lastpos = match.end()
-        if lastpos < len(fmt):
-            regex += re.escape(fmt[lastpos:])
-        return regex
-
-    def run(self, info):
-        for dictn in self._data:
-            tmpl, tmpl_dict = self._downloader.prepare_outtmpl(dictn['tmpl'], info)
-            data_to_parse = self._downloader.escape_outtmpl(tmpl) % tmpl_dict
-            self.write_debug('Searching for r"%s" in %s' % (dictn['regex'], dictn['tmpl']))
-            match = re.search(dictn['regex'], data_to_parse)
-            if match is None:
-                self.report_warning('Could not interpret video %s as "%s"' % (dictn['in'], dictn['out']))
-                continue
-            for attribute, value in match.groupdict().items():
-                info[attribute] = value
-                self.to_screen('parsed %s from "%s": %s' % (attribute, dictn['tmpl'], value if value is not None else 'NA'))
-        return [], info
-
-
-class MetadataFromTitlePP(MetadataFromFieldPP):  # for backward compatibility
-    def __init__(self, downloader, titleformat):
-        super(MetadataFromTitlePP, self).__init__(downloader, ['%%(title)s:%s' % titleformat])
-        self._titleformat = titleformat
-        self._titleregex = self._data[0]['regex']
diff --git a/yt_dlp/postprocessor/metadataparser.py b/yt_dlp/postprocessor/metadataparser.py

new file mode 100644 (file)

index 0000000..4d3c0e0
--- /dev/null
+++ b/yt_dlp/postprocessor/metadataparser.py
@@ -0,0 +1,117 @@
+import re
+
+from enum import Enum
+
+from .common import PostProcessor
+
+
+class MetadataParserPP(PostProcessor):
+    """Postprocessor that runs a list of metadata actions on the info dict.
+
+    Each action is a tuple: a member of Actions followed by the arguments of
+    the factory method that the member's value names.
+    """
+
+    class Actions(Enum):
+        # Each value is the name of a factory method below; it is resolved
+        # with getattr(), so the spelling must match the method name
+        INTERPRET = 'interpretter'
+        REPLACE = 'replacer'
+
+    def __init__(self, downloader, actions):
+        PostProcessor.__init__(self, downloader)
+        self._actions = []
+        for f in actions:
+            action = f[0]
+            assert isinstance(action, self.Actions)
+            # Build each callable (compiling its regex) once, up front
+            self._actions.append(getattr(self, action._value_)(*f[1:]))
+
+    @classmethod
+    def validate_action(cls, action, *data):
+        ''' Each action can be:
+                (Actions.INTERPRET, from, to) OR
+                (Actions.REPLACE, field, search, replace)
+
+            Raises ValueError if action is not a member of Actions; errors
+            raised while building the action (eg: re.error) are not caught
+        '''
+        if not isinstance(action, cls.Actions):
+            raise ValueError(f'{action!r} is not a valid action')
+        # cls stands in for self: building the closure never touches instance
+        # state, and the closure itself is discarded without being called
+        getattr(cls, action._value_)(cls, *data)
+
+    @staticmethod
+    def field_to_template(tmpl):
+        """Return '%(tmpl)s' if tmpl matches r'[a-zA-Z_]+$', else tmpl unchanged"""
+        if re.match(r'[a-zA-Z_]+$', tmpl):
+            return f'%({tmpl})s'
+        return tmpl
+
+    @staticmethod
+    def format_to_regex(fmt):
+        r"""
+        Converts a string like
+           '%(title)s - %(artist)s'
+        to a regex like
+           '(?P<title>.+)\ \-\ (?P<artist>.+)'
+
+        A string without any %(name)s field is returned unchanged
+        """
+        if not re.search(r'%\(\w+\)s', fmt):
+            return fmt
+        lastpos = 0
+        regex = ''
+        # replace %(..)s with regex group and escape other string parts
+        for match in re.finditer(r'%\((\w+)\)s', fmt):
+            regex += re.escape(fmt[lastpos:match.start()])
+            regex += rf'(?P<{match.group(1)}>.+)'
+            lastpos = match.end()
+        if lastpos < len(fmt):
+            regex += re.escape(fmt[lastpos:])
+        return regex
+
+    def run(self, info):
+        """Apply every prepared action to info, in order; returns ([], info)"""
+        for f in self._actions:
+            f(info)
+        return [], info
+
+    def interpretter(self, inp, out):
+        """Return a function that parses fields out of a rendered template.
+
+        inp is a field name or template; out is a format string or regex.
+        The returned function stores every named group of the match in info,
+        or emits a warning and leaves info untouched when nothing matches.
+        """
+        def f(info):
+            outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(template, info)
+            data_to_parse = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict
+            self.write_debug(f'Searching for r{out_re.pattern!r} in {template!r}')
+            match = out_re.search(data_to_parse)
+            if match is None:
+                # Must be an f-string, otherwise the placeholders are printed literally
+                self.report_warning(f'Could not interpret {inp!r} as {out!r}')
+                return
+            for attribute, value in match.groupdict().items():
+                info[attribute] = value
+                self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA'))
+
+        template = self.field_to_template(inp)
+        out_re = re.compile(self.format_to_regex(out))
+        return f
+
+    def replacer(self, field, search, replace):
+        """Return a function that substitutes regex matches inside one field.
+
+        The returned function replaces every match of search in info[field]
+        with replace. It warns and does nothing if the field is missing
+        (None) or is not a str.
+        """
+        def f(info):
+            val = info.get(field)
+            if val is None:
+                self.report_warning(f'Video does not have a {field}')
+                return
+            elif not isinstance(val, str):
+                self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
+                return
+            self.write_debug(f'Replacing all r{search!r} in {field} with {replace!r}')
+            info[field], n = search_re.subn(replace, val)
+            if n:
+                self.to_screen(f'Changed {field} to: {info[field]}')
+            else:
+                self.to_screen(f'Did not find r{search!r} in {field}')
+
+        search_re = re.compile(search)
+        return f
+
+
+class MetadataFromFieldPP(MetadataParserPP):
+    """Parser built from "FROM:TO" strings, each turned into an INTERPRET action."""
+
+    @classmethod
+    def to_action(cls, f):
+        """Convert a "FROM:TO" string into an (Actions.INTERPRET, from, to) tuple.
+
+        An escaped colon (a backslash followed by ":") in the FROM part is
+        unescaped. Raises ValueError if f does not match the FROM:TO pattern.
+        """
+        parsed = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
+        if parsed is None:
+            raise ValueError(f'it should be FROM:TO, not {f!r}')
+        source = parsed.group('in').replace('\\:', ':')
+        target = parsed.group('out')
+        return (cls.Actions.INTERPRET, source, target)
+
+    def __init__(self, downloader, formats):
+        parsed_actions = list(map(self.to_action, formats))
+        MetadataParserPP.__init__(self, downloader, parsed_actions)
+
+
+class MetadataFromTitlePP(MetadataParserPP):  # for backward compatibility
+    """Legacy wrapper that interprets the 'title' field using titleformat."""
+
+    def __init__(self, downloader, titleformat):
+        title_action = (self.Actions.INTERPRET, 'title', titleformat)
+        MetadataParserPP.__init__(self, downloader, [title_action])
author	pukkandan <redacted>
	Mon, 9 Aug 2021 19:52:55 +0000 (01:22 +0530)
committer	pukkandan <redacted>
	Mon, 9 Aug 2021 19:52:55 +0000 (01:22 +0530)
README.md		patch \| blob \| blame \| history
test/test_postprocessors.py		patch \| blob \| blame \| history
yt_dlp/YoutubeDL.py		patch \| blob \| blame \| history
yt_dlp/__init__.py		patch \| blob \| blame \| history
yt_dlp/options.py		patch \| blob \| blame \| history
yt_dlp/postprocessor/__init__.py		patch \| blob \| blame \| history
yt_dlp/postprocessor/metadatafromfield.py	[deleted file]	patch \| blob \| blame \| history
yt_dlp/postprocessor/metadataparser.py	[new file with mode: 0644]	patch \| blob