]>
Commit | Line | Data |
---|---|---|
1 | # coding: utf-8 | |
2 | from __future__ import unicode_literals, print_function, division | |
3 | ||
4 | """ | |
5 | A partial parser for WebVTT segments. Interprets enough of the WebVTT stream | |
6 | to be able to assemble a single stand-alone subtitle file, suitably adjusting | |
7 | timestamps on the way, while everything else is passed through unmodified. | |
8 | ||
9 | Regular expressions based on the W3C WebVTT specification | |
10 | <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described | |
11 | in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>. | |
12 | """ | |
13 | ||
14 | import re | |
15 | import io | |
16 | from .utils import int_or_none | |
17 | from .compat import ( | |
18 | compat_str as str, | |
19 | compat_Pattern, | |
20 | compat_Match, | |
21 | ) | |
22 | ||
23 | ||
class _MatchParser(object):
    """
    A cursor over a string being parsed.

    Holds the text (``_data``) and the current offset into it (``_pos``),
    and offers helpers to test for a syntax element at the cursor and to
    move the cursor past whatever was recognised.
    """

    def __init__(self, string):
        self._pos = 0
        self._data = string

    def match(self, r):
        """
        Test for ``r`` at the current position without moving.

        For a compiled pattern the result of ``Pattern.match`` is returned
        (a match object or None). For a plain string the result is the
        length of the string if the data continues with it, else None.
        Any other type raises ValueError.
        """
        if isinstance(r, compat_Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the position forward and return ``by`` unchanged.

        ``by`` may be a match object (moves by the length of the whole
        match), a string (moves by its length), an integer (moves by that
        many characters) or None (does not move). Any other type raises
        ValueError.
        """
        if isinstance(by, compat_Match):
            step = len(by.group(0))
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        elif by is None:
            step = 0
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """
        Match ``r`` at the current position and step over it on success.
        Returns whatever match() returned.
        """
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """
        Create a child parser positioned at the current offset.
        """
        return _MatchChildParser(self)
62 | ||
63 | ||
class _MatchChildParser(_MatchParser):
    """
    A tentative parser state layered on top of another parser.

    It reads the same data as its parent but keeps its own position, so
    syntax elements can be consumed speculatively; the parent only moves
    when commit() is called, which makes backtracking a matter of simply
    discarding the child.
    """

    def __init__(self, parent):
        super(_MatchChildParser, self).__init__(parent._data)
        # Start out exactly where the parent currently is.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent to this child's current position and return the
        parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
83 | ||
84 | ||
class ParseError(Exception):
    """
    Raised when the input cannot be parsed at the parser's current position.

    The message reports the offset and up to 20 characters of the data
    that follows it.
    """

    def __init__(self, parser):
        offset = parser._pos
        nearby = parser._data[offset:offset + 20]
        message = "Parse error at position %u (near %r)" % (offset, nearby)
        super(ParseError, self).__init__(message)
90 | ||
91 | ||
# A single line terminator: CRLF, CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# Matches only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')
# A timestamp: optional hours (two or more digits), two-digit minutes,
# two-digit seconds, a dot, and an optional three-digit fraction.
# Groups: (hours, minutes, seconds, milliseconds).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{2,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
101 | ||
102 | ||
def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.

    The match is expected to expose four groups: hours, minutes, seconds
    and milliseconds. The hours and milliseconds groups are optional in
    the pattern; a group that did not take part in the match is None and
    is counted as zero here rather than being passed to int().
    """

    hours, minutes, seconds, millis = ts.groups()
    total_ms = (
        int(hours or 0) * 3600000
        + int(minutes) * 60000
        + int(seconds) * 1000
        + int(millis or 0)
    )
    # 90 ticks per millisecond == 90 kHz.
    return 90 * total_ms
116 | ||
117 | ||
def _format_ts(ts):
    """
    Convert an MPEG PES timestamp into a WebVTT timestamp.
    This will lose sub-millisecond precision.

    ``ts`` is a tick count at 90 kHz; it is rounded to the nearest
    millisecond and rendered as ``HH:MM:SS.mmm``.
    """

    # Adding half a millisecond (45 ticks) before the floor division
    # rounds to the nearest millisecond.
    total_ms = int((ts + 45) // 90)
    # divmod() returns (quotient, remainder): the remainder is the field
    # being peeled off, the quotient carries on to the next larger unit.
    total_s, ms = divmod(total_ms, 1000)
    total_min, s = divmod(total_s, 60)
    h, minutes = divmod(total_min, 60)
    return '%02u:%02u:%02u.%03u' % (h, minutes, s, ms)
129 | ||
130 | ||
class Block(object):
    """
    Base class for every kind of WebVTT block.

    Instances are plain attribute bags: each keyword argument given to the
    constructor becomes an attribute. The default parse()/write_into()
    pair treats a block as opaque text matched by the subclass's ``_REGEX``
    and stored in ``raw``.
    """

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to match ``cls._REGEX`` at the parser's position.

        On success the parser is advanced past the match and a new
        instance holding the matched text in ``raw`` is returned;
        otherwise None is returned and the parser is left untouched.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's raw text to ``stream`` unchanged.
        """
        stream.write(self.raw)
150 | ||
151 | ||
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the
    header part of the file, that is, ahead of the first cue block.
    """
159 | ||
160 | ||
class Magic(HeaderBlock):
    """
    The ``WEBVTT`` signature line that opens a file, together with the
    optional X-TIMESTAMP-MAP header that may directly follow it.

    Attributes set by parse():
      extra  -- whatever follows ``WEBVTT`` on the signature line
                (including the leading space or tab), or None
      local  -- the LOCAL timestamp converted by _parse_ts(), or None
      mpegts -- the MPEGTS value as an integer, or None
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the comma-separated ``LOCAL:<timestamp>`` / ``MPEGTS:<int>``
        entries that follow ``X-TIMESTAMP-MAP=``, up to and including the
        line terminator.

        Returns a ``(local, mpegts)`` tuple; an entry that does not occur
        in the header is reported as None. Raises ParseError on malformed
        input, in which case the caller's parser is not advanced.
        """
        parser = parser.child()

        # Either entry may be missing; start from None so that a header
        # carrying only one of them does not leave the other name unbound
        # when the tuple is returned.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m is None:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            # Entries are separated by commas; the list ends at the newline.
            if parser.consume(','):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that closes the header.

        Returns a Magic instance. Unlike Block.parse(), failure is not
        signalled by None: a ParseError is raised instead. The caller's
        parser is only advanced on success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the header to ``stream``, always using ``\\n`` line endings.

        The X-TIMESTAMP-MAP line is only emitted when ``local`` or
        ``mpegts`` is truthy; a value that is None is written as zero.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
236 | ||
237 | ||
class StyleBlock(HeaderBlock):
    """
    A style block: a ``STYLE`` line followed by zero or more non-empty
    lines that do not contain ``-->``, closed by an empty line.
    The contents are kept as raw text.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
244 | ||
245 | ||
class RegionBlock(HeaderBlock):
    """
    A region definition block: a ``REGION`` line followed by zero or more
    non-empty lines that do not contain ``-->``, closed by an empty line.
    The contents are kept as raw text.
    """

    # The line terminator after the REGION keyword line is part of the
    # pattern (as it is for STYLE blocks). Without it, the settings lines
    # that follow the keyword line are not taken as part of the block.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
252 | ||
253 | ||
class CommentBlock(Block):
    """
    A comment block: ``NOTE`` followed by a space, tab or line terminator,
    then zero or more non-empty lines that do not contain ``-->``, closed
    by an empty line. The contents are kept as raw text.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
260 | ||
261 | ||
class CueBlock(Block):
    """
    A cue: optional identifier line, timing line, then payload lines.
    The payload text is stored verbatim and never interpreted.

    Attributes set by parse(): ``id``, ``start``, ``end``, ``settings``
    and ``text``; ``start`` and ``end`` are the values produced by
    _parse_ts().
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to parse a cue at the parser's position.

        Returns a CueBlock on success. Returns None, leaving the caller's
        parser where it was, when the timing line is missing or malformed.
        """
        parser = parser.child()

        # The identifier line is optional; its pattern refuses any line
        # containing "-->", so it cannot swallow the timing line.
        id_match = parser.consume(cls._REGEX_ID)
        cue_id = id_match.group(1) if id_match else None

        start_match = parser.consume(_REGEX_TS)
        if not start_match:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        end_match = parser.consume(_REGEX_TS)
        if not end_match:
            return None
        settings_match = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(start_match)
        end = _parse_ts(end_match)
        settings = None if settings_match is None else settings_match.group(1)

        # Payload lines run until the first empty line or the end of data.
        payload_lines = []
        while True:
            line = parser.consume(cls._REGEX_PAYLOAD)
            if not line:
                break
            payload_lines.append(line.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=''.join(payload_lines)
        )

    def write_into(self, stream):
        """
        Write the cue to ``stream``: identifier line (if any), timing line
        with optional settings, the payload text and a final newline.
        """
        emit = stream.write
        if self.id is not None:
            emit(self.id)
            emit('\n')
        emit(_format_ts(self.start))
        emit(' --> ')
        emit(_format_ts(self.end))
        if self.settings is not None:
            emit(' ')
            emit(self.settings)
        emit('\n')
        emit(self.text)
        emit('\n')

    @property
    def as_json(self):
        """
        The cue's fields collected into a plain dict.
        """
        fields = ('id', 'start', 'end', 'text', 'settings')
        return dict((name, getattr(self, name)) for name in fields)
334 | ||
335 | ||
def parse_fragment(frag_content):
    """
    Parse the raw bytes of a WebVTT file and yield its blocks one by one.

    The bytes are decoded as UTF-8. The first item yielded is always the
    Magic header block; it is followed by any region, style and comment
    blocks of the header section, then by the comment and cue blocks of
    the body. Blank lines between blocks are skipped. ParseError is
    raised when the body contains something that is neither a comment
    nor a cue.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    def first_parsed(kinds):
        # Try each block class in order; return the first block that
        # parses at the current position, or None if none of them does.
        for kind in kinds:
            parsed = kind.parse(parser)
            if parsed:
                return parsed
        return None

    yield Magic.parse(parser)

    # Header section: runs until something that is not a header-level
    # block shows up.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_parsed((RegionBlock, StyleBlock, CommentBlock))
        if not block:
            break
        yield block  # XXX: comment blocks could be skipped instead

    # Body section: only comments and cues are acceptable from here on.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_parsed((CommentBlock, CueBlock))
        if not block:
            raise ParseError(parser)
        yield block  # XXX: comment blocks could be skipped instead