--parse-metadata FROM:TO Parse additional metadata like title/artist
from other fields; see "MODIFYING METADATA"
for details
+ --replace-in-metadata FIELDS REGEX REPLACE
+ Replace text in a metadata field using the
+ given regex. This option can be used
+ multiple times
--xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards)
--fixup POLICY Automatically correct known faults of the
# MODIFYING METADATA
-The metadata obtained the the extractors can be modified by using `--parse-metadata FROM:TO`. The general syntax is to give the name of a field or a template (with similar syntax to [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
+The metadata obtained by the extractors can be modified by using `--parse-metadata` and `--replace-in-metadata`.
+
+`--replace-in-metadata FIELDS REGEX REPLACE` is used to replace text in any metadata field using a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax). [Backreferences](https://docs.python.org/3/library/re.html?highlight=backreferences#re.sub) can be used in the replace string for advanced use.
+
+The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or a template (with same syntax as [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--add-metadata`.
# Set "comment" field in video metadata using description instead of webpage_url
$ yt-dlp --parse-metadata 'description:(?s)(?P<meta_comment>.+)' --add-metadata
+# Replace all spaces and "_" in title and uploader with a `-`
+$ yt-dlp --replace-in-metadata 'title,uploader' '[ _]' '-'
+
```
# EXTRACTOR ARGUMENTS
ExecAfterDownloadPP,
FFmpegThumbnailsConvertorPP,
MetadataFromFieldPP,
- MetadataFromTitlePP,
+ MetadataParserPP,
)
class TestMetadataFromField(unittest.TestCase):
- def test_format_to_regex(self):
- pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
- self.assertEqual(pp._data[0]['regex'], r'(?P<title>.+)\ \-\ (?P<artist>.+)')
-
- def test_field_to_outtmpl(self):
- pp = MetadataFromFieldPP(None, ['title:%(title)s : %(artist)s'])
- self.assertEqual(pp._data[0]['tmpl'], '%(title)s')
-
- def test_in_out_seperation(self):
- pp = MetadataFromFieldPP(None, ['%(title)s \\: %(artist)s:%(title)s : %(artist)s'])
- self.assertEqual(pp._data[0]['in'], '%(title)s : %(artist)s')
- self.assertEqual(pp._data[0]['out'], '%(title)s : %(artist)s')
-
-class TestMetadataFromTitle(unittest.TestCase):
def test_format_to_regex(self):
- pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
- self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+ self.assertEqual(
+ MetadataParserPP.format_to_regex('%(title)s - %(artist)s'),
+ r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+ self.assertEqual(MetadataParserPP.format_to_regex(r'(?P<x>.+)'), r'(?P<x>.+)')
+
+ def test_field_to_template(self):
+ self.assertEqual(MetadataParserPP.field_to_template('title'), '%(title)s')
+ self.assertEqual(MetadataParserPP.field_to_template('1'), '1')
+ self.assertEqual(MetadataParserPP.field_to_template('foo bar'), 'foo bar')
+ self.assertEqual(MetadataParserPP.field_to_template(' literal'), ' literal')
+
+ def test_metadatafromfield(self):
+ self.assertEqual(
+ MetadataFromFieldPP.to_action('%(title)s \\: %(artist)s:%(title)s : %(artist)s'),
+ (MetadataParserPP.Actions.INTERPRET, '%(title)s : %(artist)s', '%(title)s : %(artist)s'))
class TestConvertThumbnail(unittest.TestCase):
ie_result = self.process_video_result(ie_result, download=download)
additional_urls = (ie_result or {}).get('additional_urls')
if additional_urls:
- # TODO: Improve MetadataFromFieldPP to allow setting a list
+ # TODO: Improve MetadataParserPP to allow setting a list
if isinstance(additional_urls, compat_str):
additional_urls = [additional_urls]
self.to_screen(
import codecs
import io
+import itertools
import os
import random
import re
)
from .compat import (
compat_getpass,
+ compat_shlex_quote,
workaround_optparse_bug9161,
)
from .cookies import SUPPORTED_BROWSERS
from .extractor import gen_extractors, list_extractors
from .extractor.common import InfoExtractor
from .extractor.adobepass import MSO_INFO
-from .postprocessor.ffmpeg import (
+from .postprocessor import (
FFmpegExtractAudioPP,
FFmpegSubtitlesConvertorPP,
FFmpegThumbnailsConvertorPP,
FFmpegVideoConvertorPP,
FFmpegVideoRemuxerPP,
+ MetadataFromFieldPP,
+ MetadataParserPP,
)
-from .postprocessor.metadatafromfield import MetadataFromFieldPP
from .YoutubeDL import YoutubeDL
if re.match(InfoExtractor.FormatSort.regex, f) is None:
parser.error('invalid format sort string "%s" specified' % f)
- if opts.metafromfield is None:
- opts.metafromfield = []
+    def metadataparser_actions(f):
+        """Yield the validated MetadataParserPP action(s) for one option value.
+
+        A str value is treated as a `--parse-metadata FROM:TO` argument; any
+        other value is treated as the (FIELDS, REGEX, REPLACE) arguments of
+        `--replace-in-metadata`, producing one REPLACE action for each
+        comma-separated field. Any failure is reported via `parser.error`
+        (presumably this aborts option parsing -- confirm against the parser).
+        """
+        if not isinstance(f, str):
+            cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f))
+            # Lazily expand "a,b" into one action per field, sharing REGEX/REPLACE
+            actions = (
+                (MetadataParserPP.Actions.REPLACE, field, *f[1:])
+                for field in f[0].split(','))
+        else:
+            cmd = '--parse-metadata %s' % compat_shlex_quote(f)
+            try:
+                actions = [MetadataFromFieldPP.to_action(f)]
+            except Exception as err:
+                parser.error(f'{cmd} is invalid; {err}')
+
+        for action in actions:
+            try:
+                MetadataParserPP.validate_action(*action)
+            except Exception as err:
+                parser.error(f'{cmd} is invalid; {err}')
+            yield action
+
+ if opts.parse_metadata is None:
+ opts.parse_metadata = []
if opts.metafromtitle is not None:
- opts.metafromfield.append('title:%s' % opts.metafromtitle)
- for f in opts.metafromfield:
- if re.match(MetadataFromFieldPP.regex, f) is None:
- parser.error('invalid format string "%s" specified for --parse-metadata' % f)
+ opts.parse_metadata.append('title:%s' % opts.metafromtitle)
+ opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata)))
any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
any_printing = opts.print_json
# PostProcessors
postprocessors = []
- if opts.metafromfield:
+ if opts.parse_metadata:
postprocessors.append({
- 'key': 'MetadataFromField',
- 'formats': opts.metafromfield,
+ 'key': 'MetadataParser',
+ 'actions': opts.parse_metadata,
# Run this immediately after extraction is complete
'when': 'pre_process'
})
help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--parse-metadata',
- metavar='FROM:TO', dest='metafromfield', action='append',
+ metavar='FROM:TO', dest='parse_metadata', action='append',
help=(
'Parse additional metadata like title/artist from other fields; '
'see "MODIFYING METADATA" for details'))
+ postproc.add_option(
+ '--replace-in-metadata',
+ dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3,
+ help='Replace text in a metadata field using the given regex. This option can be used multiple times')
postproc.add_option(
'--xattrs',
action='store_true', dest='xattrs', default=False,
)
from .xattrpp import XAttrMetadataPP
from .execafterdownload import ExecAfterDownloadPP
-from .metadatafromfield import MetadataFromFieldPP
-from .metadatafromfield import MetadataFromTitlePP
+from .metadataparser import (
+ MetadataFromFieldPP,
+ MetadataFromTitlePP,
+ MetadataParserPP,
+)
from .movefilesafterdownload import MoveFilesAfterDownloadPP
from .sponskrub import SponSkrubPP
'FFmpegThumbnailsConvertorPP',
'FFmpegVideoConvertorPP',
'FFmpegVideoRemuxerPP',
+ 'MetadataParserPP',
'MetadataFromFieldPP',
'MetadataFromTitlePP',
'MoveFilesAfterDownloadPP',
+++ /dev/null
-from __future__ import unicode_literals
-
-import re
-
-from .common import PostProcessor
-from ..compat import compat_str
-
-
-class MetadataFromFieldPP(PostProcessor):
- regex = r'(?P<in>.*?)(?<!\\):(?P<out>.+)$'
-
- def __init__(self, downloader, formats):
- PostProcessor.__init__(self, downloader)
- assert isinstance(formats, (list, tuple))
- self._data = []
- for f in formats:
- assert isinstance(f, compat_str)
- match = re.match(self.regex, f)
- assert match is not None
- inp = match.group('in').replace('\\:', ':')
- self._data.append({
- 'in': inp,
- 'out': match.group('out'),
- 'tmpl': self.field_to_template(inp),
- 'regex': self.format_to_regex(match.group('out')),
- })
-
- @staticmethod
- def field_to_template(tmpl):
- if re.match(r'[a-zA-Z_]+$', tmpl):
- return '%%(%s)s' % tmpl
- return tmpl
-
- @staticmethod
- def format_to_regex(fmt):
- r"""
- Converts a string like
- '%(title)s - %(artist)s'
- to a regex like
- '(?P<title>.+)\ \-\ (?P<artist>.+)'
- """
- if not re.search(r'%\(\w+\)s', fmt):
- return fmt
- lastpos = 0
- regex = ''
- # replace %(..)s with regex group and escape other string parts
- for match in re.finditer(r'%\((\w+)\)s', fmt):
- regex += re.escape(fmt[lastpos:match.start()])
- regex += r'(?P<%s>.+)' % match.group(1)
- lastpos = match.end()
- if lastpos < len(fmt):
- regex += re.escape(fmt[lastpos:])
- return regex
-
- def run(self, info):
- for dictn in self._data:
- tmpl, tmpl_dict = self._downloader.prepare_outtmpl(dictn['tmpl'], info)
- data_to_parse = self._downloader.escape_outtmpl(tmpl) % tmpl_dict
- self.write_debug('Searching for r"%s" in %s' % (dictn['regex'], dictn['tmpl']))
- match = re.search(dictn['regex'], data_to_parse)
- if match is None:
- self.report_warning('Could not interpret video %s as "%s"' % (dictn['in'], dictn['out']))
- continue
- for attribute, value in match.groupdict().items():
- info[attribute] = value
- self.to_screen('parsed %s from "%s": %s' % (attribute, dictn['tmpl'], value if value is not None else 'NA'))
- return [], info
-
-
-class MetadataFromTitlePP(MetadataFromFieldPP): # for backward compatibility
- def __init__(self, downloader, titleformat):
- super(MetadataFromTitlePP, self).__init__(downloader, ['%%(title)s:%s' % titleformat])
- self._titleformat = titleformat
- self._titleregex = self._data[0]['regex']
--- /dev/null
+import re
+
+from enum import Enum
+
+from .common import PostProcessor
+
+
+class MetadataParserPP(PostProcessor):
+    """Post-processor that modifies the info dict by running a list of actions.
+
+    Each action is a tuple whose first item is a member of `Actions` and whose
+    remaining items are the arguments of the factory method named by that member:
+        (Actions.INTERPRET, from, to) OR
+        (Actions.REPLACE, field, search, replace)
+    """
+
+    class Actions(Enum):
+        # Each value is the name of the method that builds the action's callable
+        INTERPRET = 'interpretter'
+        REPLACE = 'replacer'
+
+    def __init__(self, downloader, actions):
+        PostProcessor.__init__(self, downloader)
+        self._actions = []
+        for f in actions:
+            action = f[0]
+            assert isinstance(action, self.Actions)
+            # Build the callable (and compile its regex) once, up front
+            self._actions.append(getattr(self, action.value)(*f[1:]))
+
+    @classmethod
+    def validate_action(cls, action, *data):
+        ''' Each action can be:
+                (Actions.INTERPRET, from, to) OR
+                (Actions.REPLACE, field, search, replace)
+
+        Raises ValueError if `action` is not a member of Actions. Any error
+        raised while building the action (e.g. an invalid regex) propagates.
+        '''
+        if not isinstance(action, cls.Actions):
+            raise ValueError(f'{action!r} is not a valid action')
+        # The factories are instance methods; the class itself is passed in
+        # place of `self` so they can be exercised without an instance.
+        # Building the closure compiles the regex, which is the validation
+        getattr(cls, action.value)(cls, *data)
+
+    @staticmethod
+    def field_to_template(tmpl):
+        """Wrap a bare field name (letters and underscores only) as '%(name)s';
+        anything else is returned unchanged"""
+        if re.match(r'[a-zA-Z_]+$', tmpl):
+            return f'%({tmpl})s'
+        return tmpl
+
+    @staticmethod
+    def format_to_regex(fmt):
+        r"""
+        Converts a string like
+           '%(title)s - %(artist)s'
+        to a regex like
+           '(?P<title>.+)\ \-\ (?P<artist>.+)'
+        A string without any '%(field)s' is returned unchanged
+        """
+        if not re.search(r'%\(\w+\)s', fmt):
+            return fmt
+        lastpos = 0
+        regex = ''
+        # replace %(..)s with regex group and escape other string parts
+        for match in re.finditer(r'%\((\w+)\)s', fmt):
+            regex += re.escape(fmt[lastpos:match.start()])
+            regex += rf'(?P<{match.group(1)}>.+)'
+            lastpos = match.end()
+        if lastpos < len(fmt):
+            regex += re.escape(fmt[lastpos:])
+        return regex
+
+    def run(self, info):
+        """Apply every configured action to `info` in order; return ([], info)"""
+        for f in self._actions:
+            f(info)
+        return [], info
+
+    def interpretter(self, inp, out):
+        """Build a callable that evaluates `inp` (a field name or template)
+        against the info dict, searches the result with the regex derived from
+        `out`, and stores every named group into the info dict"""
+        template = self.field_to_template(inp)
+        out_re = re.compile(self.format_to_regex(out))
+
+        def f(info):
+            outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(template, info)
+            data_to_parse = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict
+            self.write_debug(f'Searching for r{out_re.pattern!r} in {template!r}')
+            match = out_re.search(data_to_parse)
+            if match is None:
+                # Must be an f-string, otherwise the placeholders are printed literally
+                self.report_warning(f'Could not interpret {inp!r} as {out!r}')
+                return
+            for attribute, value in match.groupdict().items():
+                info[attribute] = value
+                self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA'))
+
+        return f
+
+    def replacer(self, field, search, replace):
+        """Build a callable that replaces every match of the regex `search`
+        in info[field] with `replace`. Missing or non-str fields are left
+        untouched and a warning is reported"""
+        search_re = re.compile(search)
+
+        def f(info):
+            val = info.get(field)
+            if val is None:
+                self.report_warning(f'Video does not have a {field}')
+                return
+            elif not isinstance(val, str):
+                self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
+                return
+            self.write_debug(f'Replacing all r{search!r} in {field} with {replace!r}')
+            info[field], n = search_re.subn(replace, val)
+            if n:
+                self.to_screen(f'Changed {field} to: {info[field]}')
+            else:
+                self.to_screen(f'Did not find r{search!r} in {field}')
+
+        return f
+
+
+class MetadataFromFieldPP(MetadataParserPP):
+    """MetadataParserPP configured from a list of 'FROM:TO' strings"""
+
+    @classmethod
+    def to_action(cls, f):
+        r"""Convert a 'FROM:TO' string into (Actions.INTERPRET, from, to).
+
+        FROM and TO are separated by a colon that is not preceded by a
+        backslash, and TO must be non-empty. Escaped colons (\:) in FROM
+        are unescaped. Raises ValueError if `f` does not have this form.
+        """
+        mobj = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
+        if mobj is None:
+            raise ValueError(f'it should be FROM:TO, not {f!r}')
+        source = mobj.group('in').replace('\\:', ':')
+        target = mobj.group('out')
+        return (cls.Actions.INTERPRET, source, target)
+
+    def __init__(self, downloader, formats):
+        interpret_actions = [self.to_action(fmt) for fmt in formats]
+        MetadataParserPP.__init__(self, downloader, interpret_actions)
+
+
+class MetadataFromTitlePP(MetadataParserPP):  # for backward compatibility
+    """MetadataParserPP with a single INTERPRET action that parses the
+    'title' field using `titleformat`"""
+
+    def __init__(self, downloader, titleformat):
+        title_action = (self.Actions.INTERPRET, 'title', titleformat)
+        MetadataParserPP.__init__(self, downloader, [title_action])