]> jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
[cleanup] Misc cleanup and refactor (#2173)
[yt-dlp.git] / yt_dlp / webvtt.py
1 """
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
5
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
9 """
10
11 import io
12 import re
13
14 from .compat import compat_Match, compat_Pattern
15 from .utils import int_or_none, timetuple_from_msec
16
17
class _MatchParser:
    """
    A cursor over a string: remembers how far parsing has progressed and
    moves forward as syntax elements are recognised.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test `r` against the data at the current position without moving.

        For a compiled pattern the result of its `match()` is returned; for
        a plain string the result is its length when the data continues with
        that string, and None otherwise. Any other type raises ValueError.
        """
        if isinstance(r, compat_Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the position forward by what `by` stands for and return `by`.

        None leaves the position alone, a match object moves by the length
        of the matched text, a string by its length and an int by its value.
        Any other type raises ValueError.
        """
        if by is not None:
            if isinstance(by, compat_Match):
                self._pos += len(by.group(0))
            elif isinstance(by, str):
                self._pos += len(by)
            elif isinstance(by, int):
                self._pos += by
            else:
                raise ValueError(by)
        return by

    def consume(self, r):
        """Match `r` at the current position and step over it on success."""
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """Create a child state that starts at the current position."""
        return _MatchChildParser(self)
56
57
class _MatchChildParser(_MatchParser):
    """
    A parser state that walks the same data as its parent but keeps its
    own position, so that speculative parsing can be abandoned without
    disturbing the parent.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        self.__parent = parent
        # Start where the parent currently is.
        self._pos = parent._pos

    def commit(self):
        """
        Copy this child's position into the parent state and return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent
77
78
class ParseError(Exception):
    """
    Raised when the input cannot be parsed; the message reports the parser
    position and up to 20 characters of the data found there.
    """

    def __init__(self, parser):
        offset = parser._pos
        snippet = parser._data[offset:offset + 20]
        super().__init__("Parse error at position %u (near %r)" % (offset, snippet))
84
85
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) hours, optional; (2) minutes; (3) seconds; (4) milliseconds,
# which this pattern also treats as optional.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
99
100
def _parse_ts(ts):
    """
    Turn a matched WebVTT timestamp (a re.Match produced by _REGEX_TS) into
    an MPEG PES timestamp, i.e. a count of 90 kHz ticks.
    """
    # Milliseconds contributed by one unit of each captured group, in order:
    # hours, minutes, seconds, milliseconds.
    multipliers = (3600_000, 60_000, 1000, 1)
    total_msec = 0
    for part, mult in zip(ts.groups(), multipliers):
        # Groups that did not participate in the match count as zero.
        total_msec += int(part or 0) * mult
    return 90 * total_msec
108
109
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision below one millisecond is lost.
    """
    # Adding half a millisecond's worth of ticks (45) before the floor
    # division rounds to the nearest millisecond.
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)
116
117
class Block:
    """
    Base class for WebVTT blocks.

    Subclasses that rely on the default `parse` supply a compiled `_REGEX`
    class attribute; the matched text is stored verbatim as `raw`.
    """

    def __init__(self, **kwargs):
        # Each keyword argument becomes an instance attribute.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.
        Returns the new block, or None (parser untouched) if nothing matched.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """Write the block's raw text to `stream` unchanged."""
        stream.write(self.raw)
137
138
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the header
    part of the file, that is, ahead of every cue block.
    """
146
147
class Magic(HeaderBlock):
    """
    The ``WEBVTT`` signature line that opens a file, together with an
    optional ``X-TIMESTAMP-MAP`` header line.

    Attributes set by `parse`:
      extra  -- text following ``WEBVTT`` on the signature line (including
                its leading space or tab), or None
      local  -- the LOCAL timestamp of the map in 90 kHz ticks, or None
      mpegts -- the MPEGTS value of the map as an int, or None
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the attribute list that follows ``X-TIMESTAMP-MAP=`` up to and
        including the line terminator.

        Returns a ``(local, mpegts)`` tuple; an attribute that does not occur
        in the list is reported as None. Raises ParseError on malformed input.
        """
        parser = parser.child()

        # Either attribute may be missing from the list; without these
        # defaults the final return would fail with UnboundLocalError.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that ends the header.

        Raises ParseError if the signature is missing, the timestamp map is
        malformed, or the header is not followed by a line terminator.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the timestamp map (only when `local` or
        `mpegts` is non-zero) and a terminating blank line to `stream`.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
224
225
class StyleBlock(HeaderBlock):
    """
    A STYLE block: the keyword ``STYLE`` (optionally followed by spaces or
    tabs) on its own line, then any number of non-empty lines that do not
    contain ``-->``, terminated by an empty line. Kept verbatim as `raw`.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
232
233
class RegionBlock(HeaderBlock):
    """
    A REGION block: the keyword ``REGION`` (optionally followed by spaces or
    tabs), then any number of non-empty lines that do not contain ``-->``,
    terminated by an empty line. Kept verbatim as `raw`.

    The W3C grammar puts a line terminator between the keyword and the
    region settings, so that form is accepted. The terminator is optional
    so that settings written on the same line as the keyword keep matching.
    """

    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
240
241
class CommentBlock(Block):
    """
    A comment block: the keyword ``NOTE`` followed by a space, tab or line
    terminator, then any number of non-empty lines that do not contain
    ``-->``, terminated by an empty line. Kept verbatim as `raw`.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
248
249
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
      id       -- the cue identifier line (without line terminator), or None
      start    -- start time in 90 kHz ticks
      end      -- end time in 90 kHz ticks
      settings -- the cue settings text after the end timestamp, or None
      text     -- the payload lines, kept verbatim
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue at the parser's position.

        Returns the new CueBlock and advances the parser, or returns None
        and leaves the parser untouched if the input is not a cue.
        """
        # Work on a child state so that a failed attempt consumes nothing.
        parser = parser.child()

        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload runs until the first empty line or the end of data.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`: the identifier line if there is one, the
        timing line with optional settings, the payload and a final newline.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """The cue's attributes as a plain dict."""
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # (e.g. None) instead of failing on the missing attribute.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """Build a cue from a dict shaped like the one `as_json` produces."""
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether `other` carries the same text and settings as this cue
        and starts exactly when this cue ends (neither having a negative
        duration).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
342
343
def parse_fragment(frag_content):
    """
    Generate (partially) parsed WebVTT blocks from a bytes object holding
    the raw, UTF-8 encoded contents of a WebVTT file.

    The first block yielded is always the Magic header. ParseError is
    raised for input that cannot be interpreted.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    yield Magic.parse(parser)

    # Header part: region, style and comment blocks. The first thing that
    # is none of those ends the header.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in (RegionBlock, StyleBlock, CommentBlock):
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: or skip (for comments)
                break
        else:
            break

    # Body part: comment and cue blocks only; anything else is an error.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in (CommentBlock, CueBlock):
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: or skip (for comments)
                break
        else:
            raise ParseError(parser)