]> jfr.im git - yt-dlp.git/blame - yt_dlp/webvtt.py
[cleanup] Minor fixes (See desc)
[yt-dlp.git] / yt_dlp / webvtt.py
CommitLineData
4a2f19ab
F
"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""
10
4a2f19ab 11import io
f8271158 12
77f90330 13from .compat import re
aa7785f8 14from .utils import int_or_none, timetuple_from_msec
4a2f19ab
F
15
16
class _MatchParser:
    """
    A cursor over a string: it remembers how far parsing has progressed and
    offers helpers to test for a syntax element at the cursor and to step
    over it once it has been recognised.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test `r` against the data at the current position without moving.

        For a str, return its length if the data continues with it, else
        None. For a compiled pattern, return the result of anchoring it at
        the current position (a re.Match or None). Any other type raises
        ValueError.
        """
        if isinstance(r, str):
            return len(r) if self._data.startswith(r, self._pos) else None
        if isinstance(r, re.Pattern):
            return r.match(self._data, self._pos)
        raise ValueError(r)

    def advance(self, by):
        """
        Move the position forward and return `by` unchanged.

        `by` may be a re.Match (step over the matched text), a str (step
        over its length), an int (step that many characters) or None (do
        not move). Any other type raises ValueError.
        """
        if isinstance(by, re.Match):
            step = len(by.group(0))
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        elif by is None:
            step = 0
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """
        Match `r` at the current position and step over it on success;
        return whatever match() returned.
        """
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """
        Return a child parser that starts at the current position.
        """
        return _MatchChildParser(self)
55
56
class _MatchChildParser(_MatchParser):
    """
    A parser state forked from a parent parser. It reads the same data as
    the parent but tracks its own position, so a sequence of syntax elements
    can be attempted and simply abandoned if it turns out not to match.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start where the parent currently is; the parent itself is not
        # moved until commit() is called.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Copy this child's position to the parent and return the parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
76
77
class ParseError(Exception):
    """
    Raised when the input cannot be parsed at the parser's current position.
    The message reports that position and up to 20 characters that follow.
    """

    def __init__(self, parser):
        where = parser._pos
        excerpt = parser._data[where:where + 20]
        super().__init__("Parse error at position %u (near %r)" % (where, excerpt))
83
84
# A single line terminator, a run of line terminators, and end of input.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
_REGEX_EOF = re.compile(r'\Z')

# The specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that the hours part must have *2 or more* digits, but
# timestamps with a single-digit hours part have been seen in the wild,
# so any number of digits is accepted here.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: hours (optional), minutes, seconds, milliseconds (optional).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
98
99
def _parse_ts(ts):
    """
    Turn a parsed WebVTT timestamp (the re.Match produced by _REGEX_TS)
    into an MPEG PES timestamp, i.e. a count of 90 kHz ticks.
    """
    # Milliseconds contributed by one unit of each captured field, in group
    # order: hours, minutes, seconds, milliseconds.
    weights = (3600_000, 60_000, 1000, 1)
    msec = 0
    for field, weight in zip(ts.groups(), weights):
        # A group that did not participate in the match is None and counts as 0
        msec += int(field or 0) * weight
    return 90 * msec
4a2f19ab
F
107
108
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision below one millisecond is lost.
    """
    # 90 ticks make one millisecond; adding 45 before the floor division
    # rounds to the nearest millisecond instead of truncating.
    msec = int((ts + 45) // 90)
    # NOTE(review): timetuple_from_msec is expected to yield the four fields
    # hours, minutes, seconds, milliseconds - confirm against .utils
    fields = timetuple_from_msec(msec)
    return '%02u:%02u:%02u.%03u' % fields
4a2f19ab
F
115
116
class Block:
    """
    Base class for WebVTT blocks.

    Subclasses that rely on the default parse() must provide a compiled
    pattern in the class attribute `_REGEX`.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an instance attribute of that name
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is advanced past the block and an instance
        holding the matched text in `raw` is returned; otherwise the parser
        is left untouched and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's original text to `stream`.
        """
        stream.write(self.raw)
136
137
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the header
    part of the file, that is, ahead of the first cue block.
    """
145
146
class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line that opens a file, together with an
    optional X-TIMESTAMP-MAP line directly below it.

    Attributes set by parse():
      extra  -- text following "WEBVTT" on the signature line (including
                the separating space or tab), or None
      local  -- the LOCAL timestamp of X-TIMESTAMP-MAP in 90 kHz ticks,
                or None if not given
      mpegts -- the MPEGTS value of X-TIMESTAMP-MAP, or None if not given
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the comma-separated `LOCAL:<timestamp>` / `MPEGTS:<number>`
        entries that follow "X-TIMESTAMP-MAP=", up to and including the
        line terminator.

        Returns a `(local, mpegts)` tuple; an entry that does not occur on
        the line is reported as None. Raises ParseError on malformed input.
        """
        parser = parser.child()

        # Start from "not given" so that a line carrying only one of the
        # two keys yields None for the other one instead of failing with
        # UnboundLocalError at the return statement.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)

            # A separator announces another entry; a line terminator ends
            # the list; anything else is an error.
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that must follow them.

        Unlike Block.parse(), failure is reported by raising ParseError
        rather than by returning None. The given parser is only advanced
        on success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the X-TIMESTAMP-MAP line (only when
        `local` or `mpegts` is set and non-zero) and a terminating blank
        line to `stream`.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
223
224
class StyleBlock(HeaderBlock):
    """
    A STYLE block: the "STYLE" line, any following non-empty lines that do
    not contain "-->", and the line terminator that ends the block. The
    contents are kept verbatim.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
231
232
class RegionBlock(HeaderBlock):
    """
    A REGION definition block: the "REGION" line, any following non-empty
    lines that do not contain "-->", and the line terminator that ends the
    block. The contents are kept verbatim.
    """

    # The W3C syntax puts a line terminator between the "REGION" keyword and
    # the settings lines. Without accepting it here, only "REGION" plus that
    # terminator would match and the settings lines would be left behind
    # for the cue parser to choke on. The terminator is optional so that
    # settings written on the "REGION" line itself keep matching as well.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
239
240
class CommentBlock(Block):
    """
    A NOTE (comment) block: "NOTE" followed by a space, tab or line
    terminator, any following non-empty lines that do not contain "-->",
    and the line terminator that ends the block. The contents are kept
    verbatim.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
247
248
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
      id       -- the cue identifier line (without line terminator), or None
      start    -- start time in 90 kHz ticks
      end      -- end time in 90 kHz ticks
      settings -- the cue settings text following the end time, or None
      text     -- the payload lines, verbatim
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue block at the parser's position.

        Returns the parsed cue and advances the parser past it, or returns
        None and leaves the parser untouched if the input at this position
        is not a cue block.
        """
        parser = parser.child()

        # An optional identifier line precedes the timing line
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line; the first empty
        # line (or the end of input) terminates the cue.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue (identifier, timing line, payload and one trailing
        line terminator) to `stream`.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue as a plain dict with the keys 'id', 'start', 'end', 'text'
        and 'settings'.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign objects
        # (e.g. None or other block types) instead of failing with
        # AttributeError on their missing `as_json`.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    # Cues are mutable and compare by value, so they are not hashable.
    # (Defining __eq__ already implies this; it is spelled out for clarity.)
    __hash__ = None

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict of the shape produced by `as_json`.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Return True if `other` has the same text and settings as this cue
        and starts exactly where this cue ends, with neither cue ending
        before it starts.
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
341
4a2f19ab
F
342
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks contained in
    `frag_content`, a bytes object holding the raw contents of a WebVTT
    file.

    The first block yielded is always the Magic block. It is followed by
    any header blocks (regions, styles, comments) and then by the cue and
    comment blocks. ParseError is raised when the input cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode())

    yield Magic.parse(parser)

    # Header section: runs until something that is not a header block shows up
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        header = (
            RegionBlock.parse(parser)
            or StyleBlock.parse(parser)
            or CommentBlock.parse(parser))  # XXX: or skip comments
        if not header:
            break
        yield header

    # Cue section: everything up to the end of input must be a comment or a cue
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        # XXX: comments could be skipped instead of being yielded
        body = CommentBlock.parse(parser) or CueBlock.parse(parser)
        if not body:
            raise ParseError(parser)
        yield body