]> jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
[compat] Split into sub-modules (#2173)
[yt-dlp.git] / yt_dlp / webvtt.py
1 """
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
5
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
9 """
10
11 import io
12
13 from .compat import re
14 from .utils import int_or_none, timetuple_from_msec
15
16
class _MatchParser:
    """
    A cursor over a string: remembers how far parsing has progressed and
    moves forward whenever a syntax element has been recognised.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test `r` against the data at the current position without moving.

        A compiled pattern yields its re.Match (or None); a plain string
        yields its length when the data continues with it, otherwise None.
        Any other argument type raises ValueError.
        """
        if isinstance(r, re.Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the position forward and hand `by` back to the caller.

        `by` may be a re.Match (step over the matched text), a string (step
        over its length), an integer (step by that amount) or None (stay
        put). Any other argument type raises ValueError.
        """
        if isinstance(by, re.Match):
            step = by.end() - by.start()
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        elif by is None:
            step = 0
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """
        Match `r` and, on success, step over it. Returns what match() gave.
        """
        outcome = self.match(r)
        return self.advance(outcome)

    def child(self):
        """
        Create a child state starting at the current position.
        """
        return _MatchChildParser(self)
55
56
class _MatchChildParser(_MatchParser):
    """
    A parser state derived from another one. It reads the same data as its
    parent but keeps a position of its own, so a tentative parse can be
    abandoned without disturbing the parent.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start where the parent currently stands
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent state up to this child's position and return the
        parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
76
77
class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the parser
    position and up to 20 characters of the data found there.
    """

    def __init__(self, parser):
        position = parser._pos
        nearby = parser._data[position:position + 20]
        message = "Parse error at position %u (near %r)" % (position, nearby)
        super().__init__(message)
83
84
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) optional hours, (2) minutes, (3) seconds, (4) optional
# milliseconds. Note that the "." after the seconds is always required.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with an empty match) only at the very end of the input
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, CR or LF
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
98
99
def _parse_ts(ts):
    """
    Turn a parsed WebVTT timestamp (the re.Match produced by _REGEX_TS)
    into an MPEG PES timestamp, i.e. a count of 90 kHz ticks.
    """
    # Milliseconds contributed by one unit of each captured field, in
    # group order: hours, minutes, seconds, milliseconds
    msec_per_unit = (3600_000, 60_000, 1000, 1)

    total_msec = 0
    for field, unit in zip(ts.groups(), msec_per_unit):
        # Fields that did not take part in the match are None and count as 0
        total_msec += int(field or 0) * unit
    return 90 * total_msec
107
108
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision finer than a millisecond is lost.
    """
    # Adding half a millisecond (45 ticks) before the floor division rounds
    # to the nearest millisecond instead of truncating
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)
115
116
class Block:
    """
    Common base of all WebVTT blocks.

    Subclasses relying on the default parse() must provide a compiled
    pattern as the class attribute _REGEX.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the instance
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this kind at the parser's position.

        On success the parser is moved past the block and an instance
        holding the matched text in `raw` is returned; otherwise the parser
        is left alone and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's original text to `stream`.
        """
        stream.write(self.raw)
136
137
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the header
    part of a file, that is, ahead of the first cue block.
    """
145
146
class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line that opens a file, together with the
    optional X-TIMESTAMP-MAP header line that may directly follow it.

    Instances produced by parse() carry these attributes:
    extra  -- whatever follows "WEBVTT" on the signature line (including
              the leading space or tab), or None
    local  -- the LOCAL component of X-TIMESTAMP-MAP as 90 kHz ticks, or
              None if it was not given
    mpegts -- the MPEGTS component of X-TIMESTAMP-MAP as an integer, or
              None if it was not given
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header up to and including
        the line terminator; the "X-TIMESTAMP-MAP=" prefix must already
        have been consumed.

        Returns a (local, mpegts) tuple; a component that does not occur
        in the header is reported as None. Raises ParseError on malformed
        input.
        """
        parser = parser.child()

        # Either component may be absent from the header, so both need a
        # value before the loop; otherwise the return statement below would
        # fail with UnboundLocalError
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m is None:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            # A comma announces another component, a line terminator ends
            # the header line; anything else is an error
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        blank line closing the header.

        Unlike Block.parse() this never returns None: input that does not
        fit raises ParseError, and the given parser is only advanced on
        success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        # The header lines must be followed by an empty line
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the X-TIMESTAMP-MAP line (only when
        either component is non-zero) and the closing blank line.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
223
224
class StyleBlock(HeaderBlock):
    """
    A STYLE header block, kept verbatim.

    Matches the keyword "STYLE" (optionally followed by spaces or tabs) on
    a line of its own, then any number of non-empty lines that do not
    contain "-->", and finally the empty line that terminates the block.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
231
232
class RegionBlock(HeaderBlock):
    """
    A REGION header block, kept verbatim.

    Matches the keyword "REGION" (optionally followed by spaces or tabs),
    then any number of non-empty settings lines that do not contain "-->",
    and finally the line terminator that ends the block.

    The settings may start on the keyword's own line or, as laid out in
    the W3C specification, on the line after it. The line terminator after
    the keyword is therefore optional in the pattern; without it a block of
    the form "REGION<newline>settings..." would be cut off right after the
    keyword line, leaving the settings behind as unparseable input.
    """

    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
239
240
class CommentBlock(Block):
    """
    A NOTE (comment) block, kept verbatim.

    Matches the keyword "NOTE" followed by a space, a tab or a line
    terminator, then any number of non-empty lines that do not contain
    "-->", and finally the empty line that terminates the block.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
247
248
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
    id       -- the cue identifier line (without line terminator), or None
    start    -- start time in 90 kHz ticks
    end      -- end time in 90 kHz ticks
    settings -- the cue settings following the timings, or None
    text     -- the payload lines, line terminators included
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    # Cues are mutable and compare by value, hence they are not hashable.
    # (Defining __eq__ alone already implies this; it is spelled out here
    # to make the intent visible.)
    __hash__ = None

    @classmethod
    def parse(cls, parser):
        """
        Try to read a cue block at the parser's position.

        Returns the parsed cue and advances the parser past it, or returns
        None and leaves the parser untouched if no cue starts here.
        """
        # Work on a child state so a failed attempt needs no rewinding
        parser = parser.child()

        cue_id = None
        # A line without "-->" in front of the timings is the identifier;
        # the timing line itself can never match this pattern
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # Collect payload lines up to the first empty line or end of input
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`, followed by a separating blank line.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        # A payload taken from the very end of the input may lack its final
        # line terminator. Supply it, so that the terminator written below
        # really produces the blank line that separates this cue from
        # whatever is written next.
        if self.text and not self.text.endswith(('\r', '\n')):
            stream.write('\n')
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue as a dict of plain values, suitable for from_json().
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Compare by value. Objects that offer no `as_json` are left to
        # Python's default handling (i.e. they compare unequal) instead of
        # triggering an AttributeError.
        other_json = getattr(other, 'as_json', None)
        if other_json is None:
            return NotImplemented
        return self.as_json == other_json

    @classmethod
    def from_json(cls, json):
        """
        Rebuild a cue from a dict as produced by `as_json`.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether `other` is a direct continuation of this cue: same
        payload and settings, and it starts exactly when this one ends.
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
341
342
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks found in `frag_content`,
    a bytes object holding the raw contents of a WebVTT file.

    The first block yielded is always the Magic block. Raises ParseError
    when the input cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    def first_match(kinds):
        # Try each block class in turn; the first that parses wins
        for kind in kinds:
            found = kind.parse(parser)
            if found:
                return found
        return None

    yield Magic.parse(parser)

    # Header part: regions, styles and comments (XXX: comments could be
    # skipped instead). The first thing that is none of these ends it.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_match((RegionBlock, StyleBlock, CommentBlock))
        if block is None:
            break
        yield block

    # Body part: comments (XXX: could be skipped as well) and cues only;
    # anything else is an error.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_match((CommentBlock, CueBlock))
        if block is None:
            raise ParseError(parser)
        yield block