]> jfr.im git - yt-dlp.git/blob - yt_dlp/postprocessor/metadataparser.py
[postprocessor,cleanup] Create `_download_json`
[yt-dlp.git] / yt_dlp / postprocessor / metadataparser.py
1 import re
2 from enum import Enum
3
4 from .common import PostProcessor
5
6
class MetadataParserPP(PostProcessor):
    """Postprocessor that edits info-dict fields by parsing or regex-replacing them.

    The actions given to the constructor are compiled once into callables;
    run() then applies them, in order, to the info dict it is given.
    """

    class Actions(Enum):
        # The value of each member is the name of the method that builds the action.
        # NOTE: the spelling "interpretter" is part of the method's public name
        INTERPRET = 'interpretter'
        REPLACE = 'replacer'

    def __init__(self, downloader, actions):
        """
        @param downloader   Passed on to PostProcessor.__init__
        @param actions      Iterable of action tuples; see validate_action for their form

        Raises ValueError if the first item of a tuple is not an Actions member
        """
        PostProcessor.__init__(self, downloader)
        self._actions = []
        for f in actions:
            action = f[0]
            # Raise explicitly instead of using assert, which is stripped under
            # "python -O"; this is the same error that validate_action raises
            if not isinstance(action, self.Actions):
                raise ValueError(f'{action!r} is not a valid action')
            self._actions.append(getattr(self, action.value)(*f[1:]))

    @classmethod
    def validate_action(cls, action, *data):
        ''' Each action can be:
                (Actions.INTERPRET, from, to) OR
                (Actions.REPLACE, field, search, replace)

            Raises ValueError if action is not an Actions member. Any error raised
            while building the action from data is propagated to the caller
        '''
        if not isinstance(action, cls.Actions):
            raise ValueError(f'{action!r} is not a valid action')
        # The builder is called with the class standing in for "self"; only the
        # possible exception is of interest, so the returned callable is discarded
        getattr(cls, action.value)(cls, *data)  # So this can raise error to validate

    @staticmethod
    def field_to_template(tmpl):
        """
        Convert a field name to an output template

        A string made up only of ASCII letters and underscores is treated as a
        field name and wrapped as '%(name)s'. Anything else is checked with
        YoutubeDL.validate_outtmpl; the error it returns (if any) is raised,
        otherwise the string is returned unchanged
        """
        if re.match(r'[a-zA-Z_]+$', tmpl):
            return f'%({tmpl})s'

        # Imported locally - presumably to avoid a circular import; TODO confirm
        from ..YoutubeDL import YoutubeDL
        err = YoutubeDL.validate_outtmpl(tmpl)
        if err:
            raise err
        return tmpl

    @staticmethod
    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'

        A string without any '%(name)s' placeholder is assumed to already be
        a regex and is returned unchanged
        """
        if not re.search(r'%\(\w+\)s', fmt):
            return fmt
        lastpos = 0
        parts = []
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            parts.append(re.escape(fmt[lastpos:match.start()]))
            parts.append(rf'(?P<{match.group(1)}>.+)')
            lastpos = match.end()
        # Literal text after the last placeholder; escapes to '' when there is none
        parts.append(re.escape(fmt[lastpos:]))
        return ''.join(parts)

    def run(self, info):
        """Apply every compiled action to info (in place) and return ([], info)"""
        for f in self._actions:
            f(info)
        return [], info

    def interpretter(self, inp, out):
        """
        Build the callable for an Actions.INTERPRET action

        @param inp  Field name or output template; converted with field_to_template
        @param out  Regex, or '%(name)s'-style format converted with format_to_regex

        The returned callable evaluates the template against the info dict,
        searches the result with the regex and stores each named group in the
        info dict under the group's name
        """
        def f(info):
            data_to_parse = self._downloader.evaluate_outtmpl(template, info)
            self.write_debug(f'Searching for {out_re.pattern!r} in {template!r}')
            match = out_re.search(data_to_parse)
            if match is None:
                self.to_screen(f'Could not interpret {inp!r} as {out!r}')
                return
            for attribute, value in match.groupdict().items():
                # Groups that did not take part in the match are stored as None
                info[attribute] = value
                self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA'))

        # Both are computed eagerly so that invalid input raises when the action
        # is built (which validate_action relies on) rather than when it is run
        template = self.field_to_template(inp)
        out_re = re.compile(self.format_to_regex(out))
        return f

    def replacer(self, field, search, replace):
        """
        Build the callable for an Actions.REPLACE action

        @param field    Key of the info dict whose value is to be modified
        @param search   Regex to search for; compiled when the action is built
        @param replace  Replacement passed to the compiled regex's subn()

        The returned callable leaves the info dict untouched (and only reports
        it) when the field is missing, None or not a string
        """
        def f(info):
            val = info.get(field)
            if val is None:
                self.to_screen(f'Video does not have a {field}')
                return
            elif not isinstance(val, str):
                self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
                return
            self.write_debug(f'Replacing all {search!r} in {field} with {replace!r}')
            # n is the number of substitutions that were made
            info[field], n = search_re.subn(replace, val)
            if n:
                self.to_screen(f'Changed {field} to: {info[field]}')
            else:
                self.to_screen(f'Did not find {search!r} in {field}')

        search_re = re.compile(search)
        return f
101
102
class MetadataFromFieldPP(MetadataParserPP):
    """MetadataParserPP configured from 'FROM:TO' strings (INTERPRET actions only)"""

    @classmethod
    def to_action(cls, f):
        """
        Convert a 'FROM:TO' string to an (Actions.INTERPRET, FROM, TO) tuple

        The string is split at the first colon that is not preceded by a
        backslash. Each '\\:' in FROM is turned into a plain ':'; TO is kept
        as it is and must not be empty. Raises ValueError if f cannot be split
        """
        parsed = re.match(r'(?s)(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
        if parsed is None:
            raise ValueError(f'it should be FROM:TO, not {f!r}')
        source = parsed.group('in').replace('\\:', ':')
        target = parsed.group('out')
        return cls.Actions.INTERPRET, source, target

    def __init__(self, downloader, formats):
        """
        @param downloader   Passed on to MetadataParserPP.__init__
        @param formats      Iterable of 'FROM:TO' strings; see to_action
        """
        interpret_actions = list(map(self.to_action, formats))
        super().__init__(downloader, interpret_actions)
117
118
119 # Deprecated
class MetadataFromTitlePP(MetadataParserPP):
    """Deprecated postprocessor that interprets only the 'title' field"""

    def __init__(self, downloader, titleformat):
        """
        @param downloader   Passed on to MetadataParserPP.__init__
        @param titleformat  The "to" part of the single INTERPRET action on 'title'
        """
        title_action = (self.Actions.INTERPRET, 'title', titleformat)
        super().__init__(downloader, [title_action])
        # The warning is issued only after the parent constructor has run
        notice = (
            'yt_dlp.postprocessor.MetadataFromTitlePP is deprecated '
            'and may be removed in a future version. Use yt_dlp.postprocessor.MetadataFromFieldPP instead')
        self.deprecation_warning(notice)