]> jfr.im git - yt-dlp.git/blame - yt_dlp/webvtt.py
[ie/youtube] Skip formats if nsig decoding fails (#10223)
[yt-dlp.git] / yt_dlp / webvtt.py
CommitLineData
4a2f19ab
F
1"""
2A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3to be able to assemble a single stand-alone subtitle file, suitably adjusting
4timestamps on the way, while everything else is passed through unmodified.
5
6Regular expressions based on the W3C WebVTT specification
7<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
9"""
10
4a2f19ab 11import io
6929b41a 12import re
f8271158 13
aa7785f8 14from .utils import int_or_none, timetuple_from_msec
4a2f19ab
F
15
16
class _MatchParser:
    """
    Tracks a read position inside a string and offers helpers to test for
    syntax elements at that position and to step past them once matched.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test for `r` at the current position without moving.

        A compiled pattern yields the re.Match (or None); a plain string
        yields its length when the data continues with it, otherwise None.
        Anything else raises ValueError.
        """
        if isinstance(r, str):
            return len(r) if self._data.startswith(r, self._pos) else None
        if not isinstance(r, re.Pattern):
            raise ValueError(r)
        return r.match(self._data, self._pos)

    def advance(self, by):
        """
        Move the position forward by the extent of `by` and return `by`.

        `by` may be a re.Match (length of the whole match), a string (its
        length), an integer (taken as is) or None (no movement).
        Anything else raises ValueError.
        """
        if isinstance(by, re.Match):
            step = len(by.group(0))
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        elif by is None:
            step = 0
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """Match `r` at the current position and step past it if found."""
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """Create a child parser that starts at the current position."""
        return _MatchChildParser(self)
55
56
class _MatchChildParser(_MatchParser):
    """
    A parser state that shares its parent's data but keeps a position of
    its own. Work on a child when the elements being parsed may have to be
    abandoned: the parent only moves once commit() is called.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start from wherever the parent currently stands
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent to this child's current position and return the parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
76
77
class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the parser's
    position and quotes up to 100 characters of the data found there.
    """

    def __init__(self, parser):
        start = parser._pos
        data = parser._data[start:start + 100]
        super().__init__(f'Parse error at position {parser._pos} (near {data!r})')
4a2f19ab
F
82
83
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
# Capture groups, in order: hours (optional, one or more digits), minutes,
# seconds and milliseconds (optional; the dot before them is mandatory).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches only at the very end of the data
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator (CRLF, CR or LF), or an empty match where `$` applies
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# A run of one or more line terminators
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# Zero or more spaces/tabs; always matches, possibly with an empty result
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
4a2f19ab
F
98
99
def _parse_ts(ts):
    """
    Turn a parsed WebVTT timestamp (the re.Match produced by _REGEX_TS)
    into an MPEG PES timestamp, i.e. a count of 90 kHz ticks.
    """
    # Milliseconds contributed by one unit of each captured field, in group order
    multipliers = (3600_000, 60_000, 1000, 1)
    msec = 0
    for field, factor in zip(ts.groups(), multipliers):
        # Fields that did not take part in the match count as zero
        msec += int(field or 0) * factor
    return 90 * msec
4a2f19ab
F
107
108
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision below one millisecond is lost.
    """
    # Adding half a millisecond (45 ticks) before the floor division
    # rounds to the nearest millisecond
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)
4a2f19ab
F
115
116
class Block:
    """
    Abstract base for WebVTT blocks. Subclasses relying on the default
    parse() must provide a compiled pattern in `_REGEX`.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the instance
        for name, value in kwargs.items():
            setattr(self, name, value)

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is moved past the block and an instance holding
        the matched text in `raw` is returned; otherwise None is returned and
        the parser stays where it was.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """Write the block's text, exactly as it was matched, to `stream`."""
        stream.write(self.raw)
136
137
class HeaderBlock(Block):
    """
    Base class for WebVTT blocks that are only allowed in the header part
    of the file, that is, ahead of the first cue block.
    """
144
145
class Magic(HeaderBlock):
    """
    The `WEBVTT` signature line together with the header lines directly
    following it, up to the first empty line.

    Attributes set by parse():
      extra  -- text following `WEBVTT` on the signature line, or None
      local  -- LOCAL part of X-TIMESTAMP-MAP as 90 kHz ticks, or None
      mpegts -- MPEGTS part of X-TIMESTAMP-MAP as an integer, or None
      meta   -- concatenated raw metadata header lines ('' if there are none)
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    # This was removed from the spec in the 2017 revision;
    # the last spec draft to describe this syntax element is
    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
    # Nevertheless, YouTube keeps serving those
    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header; the parser must be
        positioned right after `X-TIMESTAMP-MAP=`.

        Returns a `(local, mpegts)` tuple. A component that does not occur
        in the header is reported as None. Raises ParseError on anything
        that is neither a LOCAL nor an MPEGTS component, or when the list
        is not properly separated/terminated.
        """
        parser = parser.child()

        # Either component may be absent from the header; without these
        # defaults the return statement below would hit an unbound name.
        local, mpegts = None, None

        while True:
            m = parser.consume(cls._REGEX_TSMAP_LOCAL)
            if m:
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
                if local is None:
                    raise ParseError(parser)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m:
                    mpegts = int_or_none(m.group(1))
                    if mpegts is None:
                        raise ParseError(parser)
                else:
                    raise ParseError(parser)
            # A comma introduces another component, a line end finishes the header
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line and the header lines after it.

        Unlike Block.parse() this raises ParseError instead of returning
        None when the data does not start with a valid signature or when a
        header line is neither an X-TIMESTAMP-MAP nor a metadata header.
        On success the given parser is advanced past the header.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts, meta = None, None, ''
        # Header lines continue until an empty line (or the end of the data)
        while not parser.consume(_REGEX_NL):
            if parser.consume(cls._REGEX_TSMAP):
                local, mpegts = cls.__parse_tsmap(parser)
                continue
            m = parser.consume(cls._REGEX_META)
            if m:
                meta += m.group(0)
                continue
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

    def write_into(self, stream):
        """
        Write the signature line, an X-TIMESTAMP-MAP line if either of its
        components is set (and non-zero), any metadata headers and the
        terminating empty line to `stream`.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            # A missing component is written out as zero
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        if self.meta:
            stream.write(self.meta)
        stream.write('\n')
235
236
class StyleBlock(HeaderBlock):
    """
    A style block: a `STYLE` line (optionally followed by spaces/tabs),
    then any number of non-empty lines that do not contain `-->`, and
    finally an empty line. The whole block is matched by `_REGEX`.
    """
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
243
244
class RegionBlock(HeaderBlock):
    """
    A region definition block: `REGION` (optionally followed by
    spaces/tabs), any number of non-empty settings lines that do not
    contain `-->`, and finally an empty line.

    The line terminator after `REGION` is optional in the pattern so that
    both the layout from the specification (settings on the lines after
    `REGION`) and settings placed on the `REGION` line itself are accepted.
    """
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
251
252
class CommentBlock(Block):
    """
    A comment block: `NOTE` followed by a space, a tab or a line
    terminator, then any number of non-empty lines that do not contain
    `-->`, and finally an empty line. The whole block is matched by `_REGEX`.
    """
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
259
260
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
      id       -- the cue identifier line, or None if the cue has none
      start    -- start time as an MPEG PES timestamp (90 kHz ticks)
      end      -- end time as an MPEG PES timestamp (90 kHz ticks)
      settings -- the cue settings following the end time, or None
      text     -- the raw payload lines, line terminators included
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read a cue block at the parser's position.

        Returns a CueBlock and advances the parser past it, or returns None
        (leaving the parser untouched) when no cue starts here.
        """
        # All work happens on a child so that a failed attempt consumes nothing
        parser = parser.child()

        id_ = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            id_ = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        parser.consume(_REGEX_OPTIONAL_WHITESPACE)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=id_,
            start=start, end=end, settings=settings,
            text=text.getvalue(),
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`: the identifier line (if any), the timing
        line with optional settings, the payload and a terminating newline.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """The cue's attributes as a plain dict, the inverse of from_json()."""
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        """
        Two cues are equal when all of their attributes are equal.
        Comparison with anything that is not a CueBlock is left to the
        other operand (and ultimately evaluates as unequal) instead of
        failing with an AttributeError.
        """
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """Build a cue from a dict shaped like the one returned by as_json."""
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings'],
        )

    def hinges(self, other):
        """
        Tell whether `other` directly continues this cue: same text and
        settings, and `other` starts exactly when this cue ends (with both
        cues having a start that is not after their end).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
354
4a2f19ab
F
355
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks found in `frag_content`,
    a bytes object holding the raw contents of a WebVTT file.

    The first block yielded is always the Magic header. ParseError is
    raised when the data cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode())

    yield Magic.parse(parser)

    # Header part: regions, styles and comments. The first thing that is
    # none of those ends the header.
    header_types = (RegionBlock, StyleBlock, CommentBlock)
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in header_types:
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: comments could be skipped instead
                break
        else:
            break

    # Cue part: comments and cues only; anything else is an error.
    body_types = (CommentBlock, CueBlock)
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in body_types:
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: comments could be skipped instead
                break
        else:
            raise ParseError(parser)