jfr.im git - yt-dlp.git/commitdiff
Add option `--parse-metadata`
author pukkandan <redacted>
Tue, 26 Jan 2021 10:20:20 +0000 (15:50 +0530)
committer pukkandan <redacted>
Tue, 26 Jan 2021 10:44:31 +0000 (16:14 +0530)
* The fields extracted by this can be used in `--output`
* Deprecated `--metadata-from-title`

:ci skip dl

README.md
test/test_postprocessors.py
youtube_dlc/YoutubeDL.py
youtube_dlc/__init__.py
youtube_dlc/options.py
youtube_dlc/postprocessor/__init__.py
youtube_dlc/postprocessor/metadatafromfield.py [new file with mode: 0644]
youtube_dlc/postprocessor/metadatafromtitle.py [deleted file]

index 7524e849398a044f5ec3424e4f2c4c7cd8369a70..886ec245f403a7383b8ff5d8f025b371bc280083 100644 (file)
--- a/README.md
+++ b/README.md
@@ -610,16 +610,19 @@ ## Post-Processing Options:
     --no-embed-thumbnail             Do not embed thumbnail (default)
     --add-metadata                   Write metadata to the video file
     --no-add-metadata                Do not write metadata (default)
-    --metadata-from-title FORMAT     Parse additional metadata like song title /
-                                     artist from the video title. The format
-                                     syntax is the same as --output. Regular
-                                     expression with named capture groups may
-                                     also be used. The parsed parameters replace
-                                     existing values. Example: --metadata-from-
-                                     title "%(artist)s - %(title)s" matches a
+    --parse-metadata FIELD:FORMAT    Parse additional metadata like title/artist
+                                     from other fields. Give field name to
+                                     extract data from, and format of the field
+                                     separated by a ":". The format syntax is
+                                     the same as --output. Regular expression
+                                     with named capture groups may also be used.
+                                     The parsed parameters replace existing
+                                     values. This option can be used multiple
+                                     times. Example: --parse-metadata
+                                     "title:%(artist)s - %(title)s" matches a
                                      title like "Coldplay - Paradise". Example
-                                     (regex): --metadata-from-title
-                                     "(?P<artist>.+?) - (?P<title>.+)"
+                                     (regex): --parse-metadata
+                                     "description:Artist - (?P<artist>.+?)"
     --xattrs                         Write metadata to the video file's xattrs
                                      (using dublin core and xdg standards)
     --fixup POLICY                   Automatically correct known faults of the
@@ -1098,7 +1101,7 @@ # PLUGINS
 
 Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example.
 
-**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`)
+**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root dir>/youtube_dlc/__main__.py`)
 
 # MORE
-For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl)
+For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq)
index 6f538a3da0f823361f86e84f502e58f25540594e..fabe7e6fb9a20c6a8179e6aa8e70c7d68c4c9d6b 100644 (file)
@@ -8,10 +8,16 @@
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from youtube_dlc.postprocessor import MetadataFromTitlePP
+from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP
+
+
+class TestMetadataFromField(unittest.TestCase):
+    def test_format_to_regex(self):
+        pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
+        self.assertEqual(pp._data[0]['regex'], r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
 
 
 class TestMetadataFromTitle(unittest.TestCase):
     def test_format_to_regex(self):
         pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
-        self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+        self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
index ce990507ca04f4ae0d66b9f5beff3d7602e986f8..0e93303b1e5e27e67cf3f4ded17992d49a40b83c 100644 (file)
@@ -375,8 +375,7 @@ class YoutubeDL(object):
 
     params = None
     _ies = []
-    _pps = []
-    _pps_end = []
+    _pps = {'beforedl': [], 'aftermove': [], 'normal': []}
     __prepare_filename_warned = False
     _download_retcode = None
     _num_downloads = None
@@ -390,8 +389,7 @@ def __init__(self, params=None, auto_init=True):
             params = {}
         self._ies = []
         self._ies_instances = {}
-        self._pps = []
-        self._pps_end = []
+        self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
         self.__prepare_filename_warned = False
         self._post_hooks = []
         self._progress_hooks = []
@@ -494,11 +492,13 @@ def check_deprecated(param, option, suggestion):
             pp_class = get_postprocessor(pp_def_raw['key'])
             pp_def = dict(pp_def_raw)
             del pp_def['key']
-            after_move = pp_def.get('_after_move', False)
-            if '_after_move' in pp_def:
-                del pp_def['_after_move']
+            if 'when' in pp_def:
+                when = pp_def['when']
+                del pp_def['when']
+            else:
+                when = 'normal'
             pp = pp_class(self, **compat_kwargs(pp_def))
-            self.add_post_processor(pp, after_move=after_move)
+            self.add_post_processor(pp, when=when)
 
         for ph in self.params.get('post_hooks', []):
             self.add_post_hook(ph)
@@ -550,12 +550,9 @@ def add_default_info_extractors(self):
         for ie in gen_extractor_classes():
             self.add_info_extractor(ie)
 
-    def add_post_processor(self, pp, after_move=False):
+    def add_post_processor(self, pp, when='normal'):
         """Add a PostProcessor object to the end of the chain."""
-        if after_move:
-            self._pps_end.append(pp)
-        else:
-            self._pps.append(pp)
+        self._pps[when].append(pp)
         pp.set_downloader(self)
 
     def add_post_hook(self, ph):
@@ -1948,6 +1945,8 @@ def process_info(self, info_dict):
 
         self._num_downloads += 1
 
+        info_dict = self.pre_process(info_dict)
+
         filename = self.prepare_filename(info_dict, warn=True)
         info_dict['_filename'] = full_filename = self.prepare_filepath(filename)
         temp_filename = self.prepare_filepath(filename, 'temp')
@@ -2400,41 +2399,45 @@ def filter_requested_info(info_dict):
             (k, v) for k, v in info_dict.items()
             if k not in ['requested_formats', 'requested_subtitles'])
 
+    def run_pp(self, pp, infodict, files_to_move={}):
+        files_to_delete = []
+        try:
+            files_to_delete, infodict = pp.run(infodict)
+        except PostProcessingError as e:
+            self.report_error(e.msg)
+        if not files_to_delete:
+            return files_to_move, infodict
+
+        if self.params.get('keepvideo', False):
+            for f in files_to_delete:
+                files_to_move.setdefault(f, '')
+        else:
+            for old_filename in set(files_to_delete):
+                self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
+                try:
+                    os.remove(encodeFilename(old_filename))
+                except (IOError, OSError):
+                    self.report_warning('Unable to remove downloaded original file')
+                if old_filename in files_to_move:
+                    del files_to_move[old_filename]
+        return files_to_move, infodict
+
+    def pre_process(self, ie_info):
+        info = dict(ie_info)
+        for pp in self._pps['beforedl']:
+            info = self.run_pp(pp, info)[1]
+        return info
+
     def post_process(self, filename, ie_info, files_to_move={}):
         """Run all the postprocessors on the given file."""
         info = dict(ie_info)
         info['filepath'] = filename
 
-        def run_pp(pp):
-            files_to_delete = []
-            infodict = info
-            try:
-                files_to_delete, infodict = pp.run(infodict)
-            except PostProcessingError as e:
-                self.report_error(e.msg)
-            if not files_to_delete:
-                return infodict
-
-            if self.params.get('keepvideo', False):
-                for f in files_to_delete:
-                    files_to_move.setdefault(f, '')
-            else:
-                for old_filename in set(files_to_delete):
-                    self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
-                    try:
-                        os.remove(encodeFilename(old_filename))
-                    except (IOError, OSError):
-                        self.report_warning('Unable to remove downloaded original file')
-                    if old_filename in files_to_move:
-                        del files_to_move[old_filename]
-            return infodict
-
-        for pp in ie_info.get('__postprocessors', []) + self._pps:
-            info = run_pp(pp)
-        info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move))
-        files_to_move = {}
-        for pp in self._pps_end:
-            info = run_pp(pp)
+        for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
+            files_to_move, info = self.run_pp(pp, info, files_to_move)
+        info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1]
+        for pp in self._pps['aftermove']:
+            files_to_move, info = self.run_pp(pp, info, {})
 
     def _make_archive_id(self, info_dict):
         video_id = info_dict.get('id')
index e2db6626650e549f76723b53677562eded95b015..5f97b51ff14e262f25ad84645cb722d978f16668 100644 (file)
@@ -45,6 +45,7 @@
 from .extractor import gen_extractors, list_extractors
 from .extractor.common import InfoExtractor
 from .extractor.adobepass import MSO_INFO
+from .postprocessor.metadatafromfield import MetadataFromFieldPP
 from .YoutubeDL import YoutubeDL
 
 
@@ -249,16 +250,25 @@ def parse_retries(retries):
         if re.match(InfoExtractor.FormatSort.regex, f) is None:
             parser.error('invalid format sort string "%s" specified' % f)
 
+    if opts.metafromfield is None:
+        opts.metafromfield = []
+    if opts.metafromtitle is not None:
+        opts.metafromfield.append('title:%s' % opts.metafromtitle)
+    for f in opts.metafromfield:
+        if re.match(MetadataFromFieldPP.regex, f) is None:
+            parser.error('invalid format string "%s" specified for --parse-metadata' % f)
+
     any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
     any_printing = opts.print_json
     download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
 
     # PostProcessors
     postprocessors = []
-    if opts.metafromtitle:
+    if opts.metafromfield:
         postprocessors.append({
-            'key': 'MetadataFromTitle',
-            'titleformat': opts.metafromtitle
+            'key': 'MetadataFromField',
+            'formats': opts.metafromfield,
+            'when': 'beforedl'
         })
     if opts.extractaudio:
         postprocessors.append({
@@ -324,7 +334,7 @@ def parse_retries(retries):
         postprocessors.append({
             'key': 'ExecAfterDownload',
             'exec_cmd': opts.exec_cmd,
-            '_after_move': True
+            'when': 'aftermove'
         })
 
     _args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n'
index 4910c2083f2e0864556563c7d474c3db2048127a..859f28e2b1c053a007bd7b5d39385b591935ac85 100644 (file)
@@ -1078,14 +1078,20 @@ def _dict_from_multiple_values_options_callback(
     postproc.add_option(
         '--metadata-from-title',
         metavar='FORMAT', dest='metafromtitle',
+        help=optparse.SUPPRESS_HELP)
+    postproc.add_option(
+        '--parse-metadata',
+        metavar='FIELD:FORMAT', dest='metafromfield', action='append',
         help=(
-            'Parse additional metadata like song title / artist from the video title. '
-            'The format syntax is the same as --output. Regular expression with '
-            'named capture groups may also be used. '
+            'Parse additional metadata like title/artist from other fields. '
+            'Give field name to extract data from, and format of the field separated by a ":". '
+            'The format syntax is the same as --output. '
+            'Regular expression with named capture groups may also be used. '
             'The parsed parameters replace existing values. '
-            'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
+            'This option can be used multiple times. '
+            'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
             '"Coldplay - Paradise". '
-            'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'))
+            'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
     postproc.add_option(
         '--xattrs',
         action='store_true', dest='xattrs', default=False,
index 840a83b0e27e1f1892dfbf2be93e48f76987ef79..c5aa925c65742da895a8619cf3794dbdf4a4c7b4 100644 (file)
@@ -16,7 +16,8 @@
 )
 from .xattrpp import XAttrMetadataPP
 from .execafterdownload import ExecAfterDownloadPP
-from .metadatafromtitle import MetadataFromTitlePP
+from .metadatafromfield import MetadataFromFieldPP
+from .metadatafromfield import MetadataFromTitlePP
 from .movefilesafterdownload import MoveFilesAfterDownloadPP
 from .sponskrub import SponSkrubPP
 
@@ -39,6 +40,7 @@ def get_postprocessor(key):
     'FFmpegSubtitlesConvertorPP',
     'FFmpegVideoConvertorPP',
     'FFmpegVideoRemuxerPP',
+    'MetadataFromFieldPP',
     'MetadataFromTitlePP',
     'MoveFilesAfterDownloadPP',
     'SponSkrubPP',
diff --git a/youtube_dlc/postprocessor/metadatafromfield.py b/youtube_dlc/postprocessor/metadatafromfield.py
new file mode 100644 (file)
index 0000000..eb77432
--- /dev/null
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import PostProcessor
+from ..compat import compat_str
+
+
+class MetadataFromFieldPP(PostProcessor):
+    regex = r'(?P<field>\w+):(?P<format>.+)$'
+
+    def __init__(self, downloader, formats):
+        PostProcessor.__init__(self, downloader)
+        assert isinstance(formats, (list, tuple))
+        self._data = []
+        for f in formats:
+            assert isinstance(f, compat_str)
+            match = re.match(self.regex, f)
+            assert match is not None
+            self._data.append({
+                'field': match.group('field'),
+                'format': match.group('format'),
+                'regex': self.format_to_regex(match.group('format'))})
+
+    def format_to_regex(self, fmt):
+        r"""
+        Converts a string like
+           '%(title)s - %(artist)s'
+        to a regex like
+           '(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)'
+        """
+        if not re.search(r'%\(\w+\)s', fmt):
+            return fmt
+        lastpos = 0
+        regex = ''
+        # replace %(..)s with regex group and escape other string parts
+        for match in re.finditer(r'%\((\w+)\)s', fmt):
+            regex += re.escape(fmt[lastpos:match.start()])
+            regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
+            lastpos = match.end()
+        if lastpos < len(fmt):
+            regex += re.escape(fmt[lastpos:])
+        return regex
+
+    def run(self, info):
+        for dictn in self._data:
+            field, regex = dictn['field'], dictn['regex']
+            if field not in info:
+                self.report_warning('Video does not have a %s' % field)
+                continue
+            self.write_debug('Searching for r"%s" in %s' % (regex, field))
+            match = re.search(regex, info[field])
+            if match is None:
+                self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
+                continue
+            for attribute, value in match.groupdict().items():
+                info[attribute] = value
+                self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
+        return [], info
+
+
+class MetadataFromTitlePP(MetadataFromFieldPP):  # for backward compatibility
+    def __init__(self, downloader, titleformat):
+        super(MetadataFromTitlePP, self).__init__(downloader, ['title:%s' % titleformat])
+        self._titleformat = titleformat
+        self._titleregex = self._data[0]['regex']
diff --git a/youtube_dlc/postprocessor/metadatafromtitle.py b/youtube_dlc/postprocessor/metadatafromtitle.py
deleted file mode 100644 (file)
index 86df3b4..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import PostProcessor
-
-
-class MetadataFromTitlePP(PostProcessor):
-    def __init__(self, downloader, titleformat):
-        super(MetadataFromTitlePP, self).__init__(downloader)
-        self._titleformat = titleformat
-        self._titleregex = (self.format_to_regex(titleformat)
-                            if re.search(r'%\(\w+\)s', titleformat)
-                            else titleformat)
-
-    def format_to_regex(self, fmt):
-        r"""
-        Converts a string like
-           '%(title)s - %(artist)s'
-        to a regex like
-           '(?P<title>.+)\ \-\ (?P<artist>.+)'
-        """
-        lastpos = 0
-        regex = ''
-        # replace %(..)s with regex group and escape other string parts
-        for match in re.finditer(r'%\((\w+)\)s', fmt):
-            regex += re.escape(fmt[lastpos:match.start()])
-            regex += r'(?P<' + match.group(1) + '>.+)'
-            lastpos = match.end()
-        if lastpos < len(fmt):
-            regex += re.escape(fmt[lastpos:])
-        return regex
-
-    def run(self, info):
-        title = info['title']
-        match = re.match(self._titleregex, title)
-        if match is None:
-            self.to_screen('Could not interpret title of video as "%s"' % self._titleformat)
-            return [], info
-        for attribute, value in match.groupdict().items():
-            info[attribute] = value
-            self.to_screen('parsed %s: %s' % (attribute, value if value is not None else 'NA'))
-
-        return [], info