]>
Commit | Line | Data |
---|---|---|
4a2f19ab F |
1 | """ |
2 | A partial parser for WebVTT segments. Interprets enough of the WebVTT stream | |
3 | to be able to assemble a single stand-alone subtitle file, suitably adjusting | |
4 | timestamps on the way, while everything else is passed through unmodified. | |
5 | ||
6 | Regular expressions based on the W3C WebVTT specification | |
7 | <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described | |
8 | in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>. | |
9 | """ | |
10 | ||
4a2f19ab | 11 | import io |
6929b41a | 12 | import re |
f8271158 | 13 | |
aa7785f8 | 14 | from .utils import int_or_none, timetuple_from_msec |
4a2f19ab F |
15 | |
16 | ||
class _MatchParser:
    """
    A cursor over the string being parsed.

    Keeps track of the current offset into the data and offers helpers that
    test for a syntax element at that offset and step past it on success.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test for `r` at the current position without moving the cursor.

        A compiled pattern yields its re.Match object (or None); a plain
        string yields its own length when the data continues with it (or
        None otherwise). Any other argument type raises ValueError.
        """
        if isinstance(r, re.Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the cursor forward by whatever `by` stands for and return `by`.

        None moves nowhere, an re.Match moves past the matched text, a
        string moves by its length and an int by that many characters.
        Any other argument type raises ValueError.
        """
        if by is None:
            step = 0
        elif isinstance(by, re.Match):
            step = len(by.group(0))
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """
        Combine match() and advance(): step past `r` if it is present and
        return the result of match() either way.
        """
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """
        Create a parser over the same data, starting at the current position.
        """
        return _MatchChildParser(self)
55 | ||
56 | ||
class _MatchChildParser(_MatchParser):
    """
    A parser state derived from another one: it reads the same data as its
    parent but keeps a position of its own. Use it to try out syntax
    elements that may have to be abandoned; the parent only moves once
    commit() is called.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start where the parent currently stands.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent state up to this child's current position and
        return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent
76 | ||
77 | ||
class ParseError(Exception):
    """
    Raised when the input cannot be parsed at the parser's current position.
    The message carries the position and up to 20 characters of the data
    found there.
    """

    def __init__(self, parser):
        offset = parser._pos
        context = parser._data[offset:offset + 20]
        super().__init__("Parse error at position %u (near %r)" % (offset, context))
83 | ||
84 | ||
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Capture groups, in order: hours (optional), minutes, seconds and
# milliseconds (optional).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator; the `$` alternative also lets the end of the
# data close a line.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
98 | ||
99 | ||
def _parse_ts(ts):
    """
    Turn a WebVTT timestamp (the re.Match produced by _REGEX_TS) into an
    MPEG PES timestamp, i.e. a count of 90 kHz ticks.

    Groups that did not participate in the match count as zero.
    """
    # Milliseconds represented by one unit of each captured field:
    # hours, minutes, seconds, milliseconds.
    msec_per_unit = (3600_000, 60_000, 1000, 1)
    total_msec = 0
    for field, factor in zip(ts.groups(), msec_per_unit):
        total_msec += int(field or 0) * factor
    return 90 * total_msec
4a2f19ab F |
107 | |
108 | ||
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision below one millisecond is lost in the process.
    """
    # Adding half a millisecond (45 ticks) before the floor division rounds
    # to the nearest millisecond.
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)
4a2f19ab F |
115 | |
116 | ||
class Block:
    """
    Base class of all WebVTT blocks.

    Subclasses that rely on the default parse() must provide a compiled
    pattern as the class attribute `_REGEX`.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name, value in kwargs.items():
            setattr(self, name, value)

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is advanced and an instance holding the
        matched text in `raw` is returned; otherwise None is returned and
        the parser is left untouched.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's raw text into `stream`.
        """
        stream.write(self.raw)
136 | ||
137 | ||
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the
    header part of the file, that is, ahead of the first cue block.
    """
144 | ||
145 | ||
class Magic(HeaderBlock):
    """
    The file signature (the "WEBVTT" line) together with the header lines
    that directly follow it: an optional X-TIMESTAMP-MAP and legacy
    metadata headers.

    Attributes set by parse():
        extra:  the text following "WEBVTT" on the signature line, or None
        local:  LOCAL component of X-TIMESTAMP-MAP in 90 kHz ticks, or None
        mpegts: MPEGTS component of X-TIMESTAMP-MAP as an int, or None
        meta:   the raw metadata header lines, concatenated ('' if none)
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    # This was removed from the spec in the 2017 revision;
    # the last spec draft to describe this syntax element is
    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
    # Nevertheless, YouTube keeps serving those
    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header (the part after '=').

        Returns a (local, mpegts) tuple; a component that does not occur in
        the header is returned as None. Raises ParseError on malformed
        input. The given parser is only advanced on success.
        """
        parser = parser.child()

        # Either component may be missing from the header. Start from None
        # so that the return below never hits an unbound local;
        # write_into() substitutes 0 for a missing component.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line and the header lines up to the first
        empty line (or the end of the data).

        Unlike Block.parse(), this raises ParseError instead of returning
        None when the signature is missing or a header line is not
        understood. The given parser is only advanced on success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts, meta = None, None, ''
        while not parser.consume(_REGEX_NL):
            if parser.consume(cls._REGEX_TSMAP):
                local, mpegts = cls.__parse_tsmap(parser)
                continue
            m = parser.consume(cls._REGEX_META)
            if m:
                meta += m.group(0)
                continue
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

    def write_into(self, stream):
        """
        Write the signature line, the X-TIMESTAMP-MAP header (only when
        either component is non-zero), any metadata headers and the
        terminating empty line into `stream`.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        if self.meta:
            stream.write(self.meta)
        stream.write('\n')
235 | ||
236 | ||
class StyleBlock(HeaderBlock):
    """
    A STYLE block: the "STYLE" line, any number of non-empty lines that do
    not contain "-->", and the terminating empty line. The content is kept
    as raw text and not interpreted.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
243 | ||
244 | ||
class RegionBlock(HeaderBlock):
    """
    A REGION block: the "REGION" keyword, any number of non-empty lines
    that do not contain "-->", and the terminating empty line. The content
    is kept as raw text and not interpreted.
    """

    # The W3C specification puts a line terminator between "REGION" and the
    # region settings. It is optional here so that input carrying settings
    # on the "REGION" line itself keeps being accepted.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
251 | ||
252 | ||
class CommentBlock(Block):
    """
    A NOTE (comment) block: "NOTE" followed by a space, tab or line
    terminator, any number of non-empty lines that do not contain "-->",
    and the terminating empty line. The content is kept as raw text.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
259 | ||
260 | ||
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
        id:       the cue identifier line, or None if the cue has none
        start:    start time in 90 kHz ticks
        end:      end time in 90 kHz ticks
        settings: the cue settings following the timings, or None
        text:     the raw payload lines, line terminators included
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue block at the parser's position.

        Returns the parsed cue and advances the parser, or returns None and
        leaves the parser untouched if there is no well-formed cue here.
        """
        parser = parser.child()

        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload runs until the first empty line or the end of data.
        payload = []
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            payload.append(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=''.join(payload),
        )

    def write_into(self, stream):
        """
        Write the cue (identifier, timings, settings, payload and the
        terminating empty line) into `stream`.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue as a plain dict, suitable for from_json().
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on `other.as_json`.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    # Cues are mutable and compare by value, hence they are unhashable.
    # Defining __eq__ already implies this; it is spelled out for clarity.
    __hash__ = None

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict as produced by the as_json property.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings'],
        )

    def hinges(self, other):
        """
        Tell whether `other` directly continues this cue: same text and
        settings, and `other` starts exactly where this cue ends.
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
353 | ||
4a2f19ab F |
354 | |
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks found in `frag_content`,
    a bytes object holding the raw contents of a WebVTT file.

    The first item is always the Magic block. ParseError is raised when the
    data cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode())

    def first_block(block_types):
        # Try the given block types in order; the first one that parses
        # wins (and has advanced the parser).
        for block_type in block_types:
            parsed = block_type.parse(parser)
            if parsed:
                return parsed
        return None

    yield Magic.parse(parser)

    # Header part: ends at the first thing that is not a header block.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_block((RegionBlock, StyleBlock, CommentBlock))
        if not block:
            break
        yield block  # XXX: or skip comment blocks

    # Cue part: anything that is neither a comment nor a cue is an error.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_block((CommentBlock, CueBlock))
        if not block:
            raise ParseError(parser)
        yield block  # XXX: or skip comment blocks