]> jfr.im git - yt-dlp.git/blame - yt_dlp/webvtt.py
[ie/tiktok] Extract all web formats (#9960)
[yt-dlp.git] / yt_dlp / webvtt.py
CommitLineData
4a2f19ab
F
1"""
2A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3to be able to assemble a single stand-alone subtitle file, suitably adjusting
4timestamps on the way, while everything else is passed through unmodified.
5
6Regular expressions based on the W3C WebVTT specification
7<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
9"""
10
4a2f19ab 11import io
6929b41a 12import re
f8271158 13
aa7785f8 14from .utils import int_or_none, timetuple_from_msec
4a2f19ab
F
15
16
86e5f3ed 17class _MatchParser:
4a2f19ab
F
18 """
19 An object that maintains the current parsing position and allows
20 conveniently advancing it as syntax elements are successfully parsed.
21 """
22
23 def __init__(self, string):
24 self._data = string
25 self._pos = 0
26
27 def match(self, r):
77f90330 28 if isinstance(r, re.Pattern):
4a2f19ab
F
29 return r.match(self._data, self._pos)
30 if isinstance(r, str):
31 if self._data.startswith(r, self._pos):
32 return len(r)
33 return None
34 raise ValueError(r)
35
36 def advance(self, by):
37 if by is None:
38 amt = 0
77f90330 39 elif isinstance(by, re.Match):
4a2f19ab
F
40 amt = len(by.group(0))
41 elif isinstance(by, str):
42 amt = len(by)
43 elif isinstance(by, int):
44 amt = by
45 else:
46 raise ValueError(by)
47 self._pos += amt
48 return by
49
50 def consume(self, r):
51 return self.advance(self.match(r))
52
53 def child(self):
54 return _MatchChildParser(self)
55
56
class _MatchChildParser(_MatchParser):
    """
    A parser over the same data as its parent, with a position of its own.

    It starts at the parent's current position and can be advanced
    speculatively; the parent only moves once commit() is called, so
    simply dropping the child amounts to backtracking.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start where the parent currently is rather than at offset 0.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent to this child's current position and return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent
76
77
class ParseError(Exception):
    """
    Raised when the input cannot be parsed at the parser's current position.

    The message contains the offset and up to 100 characters of the data
    that follows it.
    """

    def __init__(self, parser):
        offset = parser._pos
        context = parser._data[offset:offset + 100]
        super().__init__("Parse error at position %u (near %r)" % (offset, context))
83
84
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) optional hours, (2) minutes, (3) seconds, (4) optional
# milliseconds.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with zero width) only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator (CRLF, CR or LF), or the end of the input.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# Any run (possibly empty) of spaces and tabs.
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
4a2f19ab
F
99
100
101def _parse_ts(ts):
102 """
103 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
104 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
105 """
19a03940 106 return 90 * sum(
107 int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
4a2f19ab
F
108
109
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.

    The value is rounded to the nearest millisecond, so any
    sub-millisecond precision is lost.
    """
    # Adding half a millisecond (45 ticks) before the floor division rounds
    # to nearest instead of truncating.
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)
4a2f19ab
F
116
117
class Block:
    """
    Base class for WebVTT blocks.

    Subclasses that rely on the default parse() must provide a compiled
    pattern in the class attribute `_REGEX`.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an instance attribute.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is advanced past the block and an instance
        holding the matched text in `raw` is returned; otherwise the parser
        is left untouched and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """Write the block's raw text to `stream`."""
        stream.write(self.raw)
137
138
class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the header
    part of the file, that is, ahead of the first cue block.
    """
145
146
class Magic(HeaderBlock):
    """
    The ``WEBVTT`` signature line plus the header lines directly following
    it, up to the first blank line (or end of input).

    Parsed instances carry these attributes:
      extra   text following ``WEBVTT`` on the signature line (or None)
      local   LOCAL part of X-TIMESTAMP-MAP as 90 kHz ticks (or None)
      mpegts  MPEGTS part of X-TIMESTAMP-MAP as an integer (or None)
      meta    raw text of any legacy ``key: value`` metadata header lines
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    # This was removed from the spec in the 2017 revision;
    # the last spec draft to describe this syntax element is
    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
    # Nevertheless, YouTube keeps serving those
    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header (the part after ``=``)
        through the end of its line.

        Returns a ``(local, mpegts)`` tuple; a component that does not
        appear in the header is None. Raises ParseError on malformed input.
        The given parser is only advanced if parsing succeeds.
        """
        parser = parser.child()

        # Either component may be missing from the header; start from None
        # so that returning below cannot hit an unbound local.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            # Components are comma-separated; the list ends with the line.
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line and the header lines that follow it.

        Unlike Block.parse(), failure to match raises ParseError instead of
        returning None. The given parser is only advanced on success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts, meta = None, None, ''
        # Header lines continue until a blank line (or the end of input).
        while not parser.consume(_REGEX_NL):
            if parser.consume(cls._REGEX_TSMAP):
                local, mpegts = cls.__parse_tsmap(parser)
                continue
            m = parser.consume(cls._REGEX_META)
            if m:
                meta += m.group(0)
                continue
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

    def write_into(self, stream):
        """
        Write the signature line, an X-TIMESTAMP-MAP line if either of its
        components is set (a missing component is written as zero), any
        metadata header lines, and the blank line that ends the header.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        if self.meta:
            stream.write(self.meta)
        stream.write('\n')
236
237
class StyleBlock(HeaderBlock):
    """
    A STYLE block: a ``STYLE`` line (optionally followed by spaces or tabs),
    then any number of non-empty lines that do not contain ``-->``,
    terminated by an empty line. The content is kept as raw text.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
244
245
class RegionBlock(HeaderBlock):
    """
    A REGION block: the ``REGION`` keyword followed by region settings,
    terminated by an empty line. The content is kept as raw text.

    The settings may start on the line after ``REGION`` (the form given in
    the W3C specification) or on the same line as the keyword.
    """

    # The line terminator after the keyword is optional so that settings
    # written on the same line as ``REGION`` keep matching. Without it, a
    # block whose settings start on the next line would only match up to
    # the end of the ``REGION`` line, leaving the settings unparsed.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
252
253
class CommentBlock(Block):
    """
    A NOTE (comment) block: ``NOTE`` followed by a space, tab or line
    terminator, then any number of non-empty lines that do not contain
    ``-->``, terminated by an empty line. The content is kept as raw text.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
260
261
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry these attributes:
      id        the cue identifier line (without terminator), or None
      start     start time as 90 kHz ticks
      end       end time as 90 kHz ticks
      settings  cue settings text following the timings, or None
      text      the raw payload lines
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read a cue block at the parser's position.

        Returns a CueBlock and advances the parser on success; returns None
        and leaves the parser untouched if the input is not a cue block.
        """
        parser = parser.child()

        # Optional identifier line ahead of the timings line.
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        # Tolerate trailing spaces/tabs before the end of the timings line.
        parser.consume(_REGEX_OPTIONAL_WHITESPACE)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue (identifier, timings line, payload) to `stream`,
        followed by a newline.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """The cue's attributes as a plain dict (see from_json)."""
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    # NOTE: defining __eq__ without __hash__ leaves instances unhashable.
    def __eq__(self, other):
        """Two cues are equal when all of their as_json fields are equal."""
        if not isinstance(other, CueBlock):
            # Let Python fall back to its default handling rather than
            # failing with AttributeError on unrelated objects.
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """Build a cue from a dict shaped like the one as_json returns."""
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Return True if `other` has the same text and settings as this cue
        and starts exactly where this cue ends (with neither cue ending
        before it starts).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
355
4a2f19ab
F
356
def parse_fragment(frag_content):
    """
    Generate (partially) parsed WebVTT blocks from a bytes object holding
    the raw contents of a WebVTT file.

    The first block yielded is always the Magic header. Raises ParseError
    if the input cannot be parsed.
    """
    parser = _MatchParser(frag_content.decode())

    yield Magic.parse(parser)

    # Header part: region, style and comment blocks, tried in that order.
    # The first position where none of them matches ends the header.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        for kind in (RegionBlock, StyleBlock, CommentBlock):
            block = kind.parse(parser)
            if block:
                yield block  # XXX: comment blocks could be skipped instead
                break
        else:
            break

    # Cue part: only comment and cue blocks are accepted from here on.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        for kind in (CommentBlock, CueBlock):
            block = kind.parse(parser)
            if block:
                yield block  # XXX: comment blocks could be skipped instead
                break
        else:
            raise ParseError(parser)