]> jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
23d67a8971d78ad3cec493c516fcc078c60fe313
[yt-dlp.git] / yt_dlp / webvtt.py
1 """
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
5
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
9 """
10
11 import io
12 import re
13
14 from .utils import int_or_none, timetuple_from_msec
15
16
class _MatchParser:
    """
    A cursor over a string: remembers how far parsing has progressed and
    offers helpers to test for, and step over, the next syntax element.
    """

    def __init__(self, string):
        self._data, self._pos = string, 0

    def match(self, r):
        """
        Test for `r` at the current position without moving the cursor.

        For a compiled pattern the result is whatever `Pattern.match`
        returns (an `re.Match` or None); for a literal string it is the
        length of the literal if the data continues with it, else None.
        Any other type of `r` raises ValueError.
        """
        if isinstance(r, str):
            return len(r) if self._data.startswith(r, self._pos) else None
        if not isinstance(r, re.Pattern):
            raise ValueError(r)
        return r.match(self._data, self._pos)

    def advance(self, by):
        """
        Move the cursor forward and hand `by` back to the caller.

        `by` may be an `re.Match` (skip the matched text), a string (skip
        its length), an integer (skip that many characters) or None (stay
        put). Any other type raises ValueError.
        """
        if isinstance(by, re.Match):
            step = len(by.group(0))
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        elif by is None:
            step = 0
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """
        Combine `match` and `advance`: step over `r` if it is present and
        return the result of the match (None when there was no match).
        """
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """
        Create a parser over the same data with its own cursor, starting
        at this parser's current position.
        """
        return _MatchChildParser(self)
55
56
class _MatchChildParser(_MatchParser):
    """
    A parser state derived from another one. It reads the same data as
    its parent but keeps a separate position, so a caller can try to parse
    a construct speculatively and simply drop the child to backtrack.
    """

    def __init__(self, parent):
        self.__parent = parent
        super().__init__(parent._data)
        # The base initialiser resets the position to 0; start where the parent is.
        self._pos = parent._pos

    def commit(self):
        """
        Move the parent's position up to this child's position and
        return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent
76
77
class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the
    parser's current position and up to 20 characters of the data found
    there.
    """

    def __init__(self, parser):
        offset = parser._pos
        excerpt = parser._data[offset:offset + 20]
        message = "Parse error at position %u (near %r)" % (offset, excerpt)
        super().__init__(message)
83
84
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Capture groups, in order: hours (optional), minutes, seconds and
# milliseconds. Note that the dot after the seconds is mandatory, whereas
# the three millisecond digits following it are optional.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with an empty match) only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, a lone CR or a lone LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
98
99
def _parse_ts(ts):
    """
    Turn a matched WebVTT timestamp (an re.Match produced by _REGEX_TS)
    into an MPEG PES timestamp, i.e. a count of 90 kHz clock ticks.
    """
    # Milliseconds contributed by one unit of each capture group, in order:
    # hours, minutes, seconds, milliseconds.
    msec_per_unit = (3600_000, 60_000, 1000, 1)

    total_msec = 0
    for digits, factor in zip(ts.groups(), msec_per_unit):
        # Groups that did not participate in the match (or are empty) count as zero.
        if digits:
            total_msec += int(digits) * factor

    # 90 ticks of the 90 kHz clock make up one millisecond.
    return 90 * total_msec
107
108
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision finer than one millisecond is lost in the process.
    """
    # Adding 45 ticks (half a millisecond) before the floor division
    # rounds to the nearest millisecond instead of truncating.
    msec = int((ts + 45) // 90)
    # NOTE(review): timetuple_from_msec is defined elsewhere; it is presumably
    # an (hours, minutes, seconds, milliseconds) tuple - confirm in .utils
    parts = timetuple_from_msec(msec)
    return '%02u:%02u:%02u.%03u' % parts
115
116
class Block:
    """
    Base class for WebVTT blocks.

    Subclasses relying on the default `parse` must provide a compiled
    pattern in the `_REGEX` class attribute.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to match `cls._REGEX` at the parser's current position.

        On success the parser is advanced past the match and a new
        instance holding the matched text in `raw` is returned; otherwise
        the parser is left alone and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's original text (`raw`) to `stream`.
        """
        stream.write(self.raw)
136
137
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the
    header part of the file, that is, ahead of the first cue block.
    """
145
146
class Magic(HeaderBlock):
    """
    The file signature ("WEBVTT" line) together with the header lines
    that follow it, up to the first blank line.

    Instances carry these attributes:
      extra  - text following "WEBVTT" on the signature line
               (including the leading space or tab), or None
      local  - LOCAL component of X-TIMESTAMP-MAP as 90 kHz ticks, or None
      mpegts - MPEGTS component of X-TIMESTAMP-MAP as an integer, or None
      meta   - concatenated raw "key: value" metadata header lines
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    # This was removed from the spec in the 2017 revision;
    # the last spec draft to describe this syntax element is
    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
    # Nevertheless, YouTube keeps serving those
    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header (the part after '=')
        up to and including the line terminator.

        Returns a (local, mpegts) tuple. A component that does not appear
        on the line is returned as None. Raises ParseError on malformed
        input; the caller's parser is only advanced on success.
        """
        parser = parser.child()

        # Start from None so that a header carrying only one of the two
        # components yields None for the other one instead of failing with
        # UnboundLocalError at the return statement; write_into() already
        # copes with either value being None.
        local, mpegts = None, None

        while True:
            m = parser.consume(cls._REGEX_TSMAP_LOCAL)
            if m:
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
                if local is None:
                    raise ParseError(parser)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m:
                    mpegts = int_or_none(m.group(1))
                    if mpegts is None:
                        raise ParseError(parser)
                else:
                    raise ParseError(parser)
            # A comma means another component follows; a line terminator
            # ends the header; anything else is an error.
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line and all header lines up to the blank line
        that ends the header.

        Returns a Magic instance and advances `parser` past the header.
        Raises ParseError if the signature is missing or a header line is
        neither an X-TIMESTAMP-MAP nor a "key: value" metadata line.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts, meta = None, None, ''
        # The header runs until an empty line (a bare line terminator).
        while not parser.consume(_REGEX_NL):
            if parser.consume(cls._REGEX_TSMAP):
                local, mpegts = cls.__parse_tsmap(parser)
                continue
            m = parser.consume(cls._REGEX_META)
            if m:
                meta += m.group(0)
                continue
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

    def write_into(self, stream):
        """
        Write a normalised header to `stream`: the signature line, an
        X-TIMESTAMP-MAP line if either component is set to a non-zero
        value (a missing component is written as zero), the metadata lines
        and finally the blank line terminating the header.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        if self.meta:
            stream.write(self.meta)
        stream.write('\n')
236
237
class StyleBlock(HeaderBlock):
    """
    A STYLE block. The contents are not interpreted; the block is kept
    as raw text.
    """

    # "STYLE" (optionally followed by spaces/tabs) on a line of its own,
    # then any number of non-empty lines that do not contain "-->",
    # then the empty line that closes the block.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
244
245
class RegionBlock(HeaderBlock):
    """
    A REGION definition block. The settings are not interpreted; the
    block is kept as raw text.
    """

    # The W3C grammar puts a line terminator between "REGION" (plus optional
    # spaces/tabs) and the settings lines. Without allowing for it here, the
    # settings-line group cannot start at that terminator, so the match would
    # end right after the "REGION" line and the settings would be left behind
    # as unparseable input. The terminator is optional so that settings placed
    # on the "REGION" line itself keep matching as well.
    #
    # After that: any number of non-empty lines that do not contain "-->",
    # then the empty line that closes the block.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
252
253
class CommentBlock(Block):
    """
    A NOTE (comment) block. The contents are not interpreted; the block
    is kept as raw text.
    """

    # "NOTE" followed by a space, a tab or a line terminator, then any
    # number of non-empty lines that do not contain "-->", then the
    # empty line that closes the block.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
260
261
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry these attributes:
      id       - the cue identifier line (without line terminator), or None
      start    - start time as 90 kHz ticks
      end      - end time as 90 kHz ticks
      settings - the cue settings text following the end time, or None
      text     - the raw payload lines, line terminators included
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to parse a cue block at the parser's current position.

        Returns a CueBlock and advances `parser` past the cue (up to, but
        not including, the blank line after the payload). Returns None and
        leaves `parser` untouched if the input is not a cue block.
        """
        parser = parser.child()

        # The identifier line is optional; the pattern cannot match the
        # timing line because it rejects lines containing "-->".
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # Collect payload lines until an empty line or the end of input.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`: identifier line (if any), timing line
        with settings (if any), the payload and a terminating blank line.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        # The payload of the last cue of an input that lacks a final line
        # terminator does not end in one. Terminate that line first, so that
        # the newline written below really produces the blank line separating
        # this cue from whatever block is written next.
        if self.text and not self.text.endswith(('\n', '\r')):
            stream.write('\n')
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue as a plain dict with the keys 'id', 'start', 'end',
        'text' and 'settings'.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        """
        Two cues are equal when their `as_json` dicts are equal. Objects
        that offer no `as_json` are left to Python's default handling.
        """
        try:
            other_json = other.as_json
        except AttributeError:
            # Not a cue-like object: let Python fall back to its default
            # comparison instead of propagating the AttributeError.
            return NotImplemented
        return self.as_json == other_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict shaped like the one `as_json` produces.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether `other` directly continues this cue: same payload and
        settings, and this cue ends exactly where `other` starts (with
        neither cue ending before it starts).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
354
355
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks found in `frag_content`,
    a bytes object holding the raw contents of a WebVTT file.

    The first block yielded is always the Magic header. ParseError is
    raised as soon as input is reached that matches no known block.
    """

    parser = _MatchParser(frag_content.decode())

    yield Magic.parse(parser)

    # Header part: region, style and comment blocks, tried in this order.
    # The first thing that is none of these ends the header part.
    header_block_types = (RegionBlock, StyleBlock, CommentBlock)
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in header_block_types:
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: or skip (for comment blocks)
                break
        else:
            break

    # Cue part: only comment and cue blocks may appear from here on.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        block = CommentBlock.parse(parser) or CueBlock.parse(parser)
        if not block:
            raise ParseError(parser)
        yield block  # XXX: or skip (for comment blocks)