]> jfr.im git - yt-dlp.git/blame - yt_dlp/webvtt.py
[outtmpl] Alternate form of format type `l` for `\n` delimited list
[yt-dlp.git] / yt_dlp / webvtt.py
CommitLineData
4a2f19ab
F
1# coding: utf-8
2from __future__ import unicode_literals, print_function, division
3
4"""
5A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
6to be able to assemble a single stand-alone subtitle file, suitably adjusting
7timestamps on the way, while everything else is passed through unmodified.
8
9Regular expressions based on the W3C WebVTT specification
10<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
11in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
12"""
13
14import re
15import io
16from .utils import int_or_none
17from .compat import (
18 compat_str as str,
19 compat_Pattern,
20 compat_Match,
21)
22
23
class _MatchParser(object):
    """
    A cursor over a string: it remembers how far parsing has progressed and
    offers helpers to test for a syntax element at that point and to step
    past it once it has been recognised.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test ``r`` against the data at the current position without moving.

        For a compiled pattern the result of its ``match()`` is returned
        (a match object or None). For a plain string the result is the
        length of that string if the data continues with it, else None.
        Any other argument type raises ValueError.
        """
        if isinstance(r, compat_Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the position forward and return ``by`` unchanged.

        ``by`` may be a match object (step over the matched text), a string
        (step over its length), an integer (step that many characters) or
        None (stay put). Any other argument type raises ValueError.
        """
        if isinstance(by, compat_Match):
            step = len(by.group())
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        elif by is None:
            step = 0
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """
        Match ``r`` at the current position and step past it on success.
        Returns what ``match()`` returned.
        """
        outcome = self.match(r)
        return self.advance(outcome)

    def child(self):
        """
        Create a child parser state that starts at the current position.
        """
        return _MatchChildParser(self)
62
63
class _MatchChildParser(_MatchParser):
    """
    A parser state that reads the same data as its parent but keeps a
    position of its own. Use it to try out a sequence of syntax elements
    that may have to be abandoned: the parent only moves when commit()
    is called.
    """

    def __init__(self, parent):
        super(_MatchChildParser, self).__init__(parent._data)
        # Start out wherever the parent currently is.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent state to this child's current position and
        return the parent.
        """
        origin = self.__parent
        origin._pos = self._pos
        return origin
83
84
class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the
    position of the given parser state and up to 20 characters of the
    data found there.
    """

    def __init__(self, parser):
        offset = parser._pos
        context = parser._data[offset:offset + 20]
        message = "Parse error at position %u (near %r)" % (offset, context)
        super(ParseError, self).__init__(message)
90
91
# Although the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that the hours part must have *2 or more* digits, timestamps with
# a single digit for the hours part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) hours, optional; (2) minutes; (3) seconds;
# (4) milliseconds, optional (the "." before them is required).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
105
106
def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.

    The match must expose four groups: hours, minutes, seconds and
    milliseconds. The hours and milliseconds groups are optional in the
    pattern; when either did not participate in the match it counts as zero.
    """

    hours, minutes, seconds, millis = ts.groups()
    total_ms = int(hours or 0) * 3600000
    total_ms += int(minutes) * 60000
    total_ms += int(seconds) * 1000
    # The milliseconds group may be None (e.g. "00:01."); int(None) would raise.
    total_ms += int(millis or 0)
    # 90 ticks per millisecond at 90 kHz.
    return 90 * total_ms
120
121
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp
    of the form HH:MM:SS.mmm. Precision below one millisecond is lost;
    the value is rounded to the nearest millisecond.
    """
    # Adding half a millisecond (45 ticks) before the floor division rounds
    # to nearest instead of truncating.
    total_ms = int((ts + 45) // 90)
    total_secs = total_ms // 1000
    total_mins = total_secs // 60
    return '%02u:%02u:%02u.%03u' % (
        total_mins // 60,
        total_mins % 60,
        total_secs % 60,
        total_ms % 1000,
    )
4a2f19ab
F
132
133
class Block(object):
    """
    Base class for WebVTT blocks. Subclasses are expected to provide a
    compiled pattern in the class attribute ``_REGEX``.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is advanced past the block and an instance
        holding the matched text in ``raw`` is returned; otherwise the
        parser is left alone and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's raw text to ``stream``.
        """
        stream.write(self.raw)
153
154
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that belong to the header part
    of the file, that is, blocks which may only occur before the first
    cue block.
    """
162
163
class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line that opens a file, together with an
    optional X-TIMESTAMP-MAP line directly below it.

    Instances created by parse() carry three attributes:
    ``extra``  -- the text following "WEBVTT" on the signature line, or None;
    ``local``  -- the LOCAL timestamp converted by _parse_ts, or None;
    ``mpegts`` -- the MPEGTS value as an integer, or None.
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value part of an X-TIMESTAMP-MAP line, up to and including
        its line terminator. The "X-TIMESTAMP-MAP=" prefix must already have
        been consumed.

        Returns a ``(local, mpegts)`` tuple; a component that does not occur
        on the line is returned as None. Raises ParseError on malformed input.
        The given parser is only advanced when parsing succeeds.
        """
        parser = parser.child()

        # Either component may be absent from the line; start from None so
        # that the return below never touches an unbound name.
        local, mpegts = None, None

        while True:
            m = parser.consume(cls._REGEX_TSMAP_LOCAL)
            if m:
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
                if local is None:
                    raise ParseError(parser)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m:
                    mpegts = int_or_none(m.group(1))
                    if mpegts is None:
                        raise ParseError(parser)
                else:
                    raise ParseError(parser)
            # A comma means another component follows; a line terminator
            # ends the header line; anything else is an error.
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that must follow them.

        Returns a Magic instance. Raises ParseError if the signature is
        missing, the timestamp map is malformed, or the terminating line
        break is absent. The given parser is only advanced on success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the X-TIMESTAMP-MAP line (only when
        ``local`` or ``mpegts`` is truthy) and an empty line to ``stream``.
        Lines are terminated with "\\n"; a missing component is written as 0.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
240
241
class StyleBlock(HeaderBlock):
    """
    A STYLE block: a "STYLE" line (optionally followed by spaces or tabs),
    any number of non-empty lines that do not contain "-->", and a
    terminating empty line. The pattern captures the text as a whole;
    the style sheet itself is not examined.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
248
249
class RegionBlock(HeaderBlock):
    """
    A REGION block: a "REGION" line (optionally followed by spaces or tabs),
    any number of non-empty lines that do not contain "-->", and a
    terminating empty line. The pattern captures the text as a whole;
    the region settings are not examined.
    """

    # The line terminator after "REGION" has to be consumed before the
    # settings lines can be matched; without it the block would end right
    # after the "REGION" line and the settings lines would be left behind.
    # It is optional so that input which matched before still matches.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
256
257
class CommentBlock(Block):
    """
    A comment block: "NOTE" followed by a space, a tab or a line terminator,
    then any number of non-empty lines that do not contain "-->", and a
    terminating empty line. The pattern captures the text as a whole.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
264
265
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry the attributes ``id`` (cue identifier or None),
    ``start`` and ``end`` (timestamps as produced by _parse_ts),
    ``settings`` (cue settings text or None) and ``text`` (the payload
    lines, including their line terminators).
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue block at the parser's position.

        Returns a CueBlock and advances the parser on success; returns None
        and leaves the parser untouched if the input is not a cue block.
        """
        parser = parser.child()

        # An optional identifier line precedes the timing line.
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line; an empty line
        # (or the end of the input) stops the loop.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue (identifier line if any, timing line with optional
        settings, payload, and a final line break) to ``stream``.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue as a plain dict with the keys id, start, end, text, settings.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Comparing with something that is not a cue must not blow up on a
        # missing ``as_json``; let Python fall back to its default handling.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict shaped like the one ``as_json`` returns.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether ``other`` has the same text and settings as this cue
        and starts exactly where this cue ends (with neither cue ending
        before it starts).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
358
4a2f19ab
F
359
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed blocks of a WebVTT file.

    ``frag_content`` is a bytes object holding the raw file contents; it is
    decoded as UTF-8. The first item yielded is always the Magic block,
    followed by header blocks and comments, then by cue blocks and
    comments. ParseError is raised when something unrecognised is met.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    yield Magic.parse(parser)

    # Header part: regions, styles and comments, separated by blank lines.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        header = (
            RegionBlock.parse(parser)
            or StyleBlock.parse(parser)
            or CommentBlock.parse(parser)  # XXX: or skip
        )
        if not header:
            # Whatever comes next is not a header block; move on to cues.
            break
        yield header

    # Cue part: cues and comments, separated by blank lines.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        item = (
            CommentBlock.parse(parser)  # XXX: or skip
            or CueBlock.parse(parser)
        )
        if not item:
            raise ParseError(parser)
        yield item