]> jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
[cleanup] Upgrade syntax
[yt-dlp.git] / yt_dlp / webvtt.py
1 """
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
5
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
9 """
10
11 import re
12 import io
13 from .utils import int_or_none, timetuple_from_msec
14 from .compat import (
15 compat_Pattern,
16 compat_Match,
17 )
18
19
20 class _MatchParser:
21 """
22 An object that maintains the current parsing position and allows
23 conveniently advancing it as syntax elements are successfully parsed.
24 """
25
26 def __init__(self, string):
27 self._data = string
28 self._pos = 0
29
30 def match(self, r):
31 if isinstance(r, compat_Pattern):
32 return r.match(self._data, self._pos)
33 if isinstance(r, str):
34 if self._data.startswith(r, self._pos):
35 return len(r)
36 return None
37 raise ValueError(r)
38
39 def advance(self, by):
40 if by is None:
41 amt = 0
42 elif isinstance(by, compat_Match):
43 amt = len(by.group(0))
44 elif isinstance(by, str):
45 amt = len(by)
46 elif isinstance(by, int):
47 amt = by
48 else:
49 raise ValueError(by)
50 self._pos += amt
51 return by
52
53 def consume(self, r):
54 return self.advance(self.match(r))
55
56 def child(self):
57 return _MatchChildParser(self)
58
59
class _MatchChildParser(_MatchParser):
    """
    A speculative parser state derived from another parser.

    It reads the same data as its parent but tracks its own position, so a
    sequence of syntax elements can be attempted and simply abandoned if it
    does not pan out. Call commit() to make the progress permanent.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start where the parent currently stands.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent to this child's current position and return the parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
79
80
class ParseError(Exception):
    """
    Raised when the input cannot be parsed at the parser's current position.

    The message reports that position and quotes up to 20 characters of the
    data starting there.
    """

    def __init__(self, parser):
        offset = parser._pos
        excerpt = parser._data[offset:offset + 20]
        message = "Parse error at position %u (near %r)" % (offset, excerpt)
        super().__init__(message)
86
87
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) hours, optional; (2) minutes; (3) seconds; (4) milliseconds.
# NOTE: the milliseconds group is itself optional while the dot before it is
# mandatory, so group 4 is None for input such as "00:01.".
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with an empty match) only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, a lone CR, or a lone LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
101
102
103 def _parse_ts(ts):
104 """
105 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
106 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
107 """
108
109 h, min, s, ms = ts.groups()
110 return 90 * (
111 int(h or 0) * 3600000 + # noqa: W504,E221,E222
112 int(min) * 60000 + # noqa: W504,E221,E222
113 int(s) * 1000 + # noqa: W504,E221,E222
114 int(ms) # noqa: W504,E221,E222
115 )
116
117
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.

    The tick count is rounded to the nearest millisecond first, so any
    sub-millisecond precision is lost.
    """
    # Adding half a millisecond (45 ticks) before the floor division rounds
    # to nearest instead of truncating.
    msec = int((ts + 45) // 90)
    fields = timetuple_from_msec(msec)
    return '%02u:%02u:%02u.%03u' % fields
124
125
class Block:
    """
    Base class for WebVTT blocks.

    Subclasses that rely on the default parse() must provide a compiled
    pattern in the class attribute `_REGEX`.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an instance attribute of that name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's current position.

        On success the parser is advanced past the block and an instance
        carrying the matched text in `raw` is returned; otherwise None is
        returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's original text to `stream` unchanged.
        """
        stream.write(self.raw)
145
146
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that belong to the header part of
    the file, i.e. that may only appear before any cue blocks.
    """
154
155
class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line that opens a file, together with the
    optional X-TIMESTAMP-MAP line that may directly follow it.

    Attributes set by parse():
      extra   text following "WEBVTT" on the signature line, including the
              leading space or tab, or None if there was none
      local   the LOCAL timestamp of the map in 90 kHz ticks, or None
      mpegts  the MPEGTS value of the map as an int, or None
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP line, up to and including its
        line terminator: a comma-separated list of LOCAL:<timestamp> and
        MPEGTS:<integer> components in any order.

        Returns (local, mpegts). A component that does not occur is reported
        as None; if one occurs more than once the last occurrence wins.
        Raises ParseError on anything else.
        """
        parser = parser.child()

        # Pre-set both results so that a map carrying only one of the two
        # components yields None for the other instead of failing with
        # UnboundLocalError at the return statement.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line, and the
        line terminator that must follow them.

        The caller's parser is only advanced on success. Raises ParseError
        if the signature is missing, the map is malformed, or the closing
        line terminator is absent.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the X-TIMESTAMP-MAP line if either value
        is set and non-zero, and a terminating blank line to `stream`.

        Output always uses '\\n' line terminators and no byte order mark; a
        missing LOCAL or MPEGTS value is written as zero.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
232
233
class StyleBlock(HeaderBlock):
    """
    A STYLE block; its content is kept verbatim and not interpreted.
    """

    # "STYLE" (optionally followed by spaces/tabs) on a line of its own,
    # then any number of non-empty lines that do not contain "-->",
    # then one more line terminator, i.e. an empty line ending the block.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
240
241
class RegionBlock(HeaderBlock):
    """
    A REGION definition block; its settings are kept verbatim and not
    interpreted.
    """

    # "REGION" (optionally followed by spaces/tabs) on a line of its own,
    # then any number of non-empty lines that do not contain "-->",
    # then one more line terminator, i.e. an empty line ending the block.
    #
    # The line terminator after the keyword is required, mirroring
    # StyleBlock and the W3C definition of a region definition block.
    # Without it, "REGION\n" alone satisfied the pattern (zero settings
    # lines plus the final terminator), leaving the settings lines behind
    # to be misread as a cue.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
248
249
class CommentBlock(Block):
    """
    A NOTE (comment) block; its content is kept verbatim.
    """

    # "NOTE" followed by a space, a tab or a line terminator,
    # then any number of non-empty lines that do not contain "-->",
    # then one more line terminator ending the block.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
256
257
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
      id        the cue identifier line (without terminator), or None
      start     start time in 90 kHz ticks
      end       end time in 90 kHz ticks
      settings  the cue settings text after the end time, or None
      text      the payload lines concatenated, terminators included

    Instances compare equal when all of these attributes are equal. As the
    class defines __eq__ without __hash__, instances are not hashable.
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue at the parser's current position.

        Returns the parsed cue and advances the caller's parser past it, or
        returns None and leaves the caller's parser untouched if the input
        at this position is not a cue.
        """
        parser = parser.child()

        # Optional identifier line: any line that does not contain "-->".
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        # Timing line: <start> --> <end> [settings] <line terminator>
        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # Payload: every following non-empty line, up to the first empty
        # line or the end of the data.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`, followed by the blank line separating it
        from whatever comes next. Timestamps are re-rendered from the tick
        values and '\\n' is used for the line terminators written here.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        # The last payload line may lack a terminator (cue at the very end
        # of the input). Finish that line first, otherwise the '\n' below
        # would merely end it and no blank line would separate this cue
        # from the next block.
        if self.text and not self.text.endswith(('\r', '\n')):
            stream.write('\n')
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue as a plain dict with the keys id, start, end, text, settings.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on `other.as_json`.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict shaped like the one produced by as_json.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether `other` carries the same text and settings as this cue
        and starts exactly where this cue ends, with neither cue having its
        end before its start.
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
350
351
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks contained in
    `frag_content`, a bytes object holding the raw contents of a WebVTT file.

    The first item is always the Magic block. Header blocks (regions, styles,
    comments) follow, then comments and cues. ParseError is raised when the
    input cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    yield Magic.parse(parser)

    # Block types accepted in each part of the file, in the order tried.
    header_kinds = (RegionBlock, StyleBlock, CommentBlock)
    body_kinds = (CommentBlock, CueBlock)  # XXX: comments could be skipped

    # Header part: ends at the first thing that is not a header block.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        for kind in header_kinds:
            parsed = kind.parse(parser)
            if parsed:
                yield parsed
                break
        else:
            # Nothing matched: leave the header loop and try cues.
            break

    # Cue part: everything up to the end must be a comment or a cue.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        for kind in body_kinds:
            parsed = kind.parse(parser)
            if parsed:
                yield parsed
                break
        else:
            raise ParseError(parser)