]> jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
[cleanup] Mark some compat variables for removal (#2173)
[yt-dlp.git] / yt_dlp / webvtt.py
1 # coding: utf-8
2 from __future__ import unicode_literals, print_function, division
3
4 """
5 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
6 to be able to assemble a single stand-alone subtitle file, suitably adjusting
7 timestamps on the way, while everything else is passed through unmodified.
8
9 Regular expressions based on the W3C WebVTT specification
10 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
11 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
12 """
13
14 import re
15 import io
16 from .utils import int_or_none, timetuple_from_msec
17 from .compat import (
18 compat_Pattern,
19 compat_Match,
20 )
21
22
class _MatchParser(object):
    """
    A cursor over a string: remembers how far parsing has progressed and
    offers helpers to test for, and step over, the next syntax element.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test whether r matches at the current position without moving.

        For a compiled pattern the result of its match() call is returned
        (a match object or None). For a plain string the result is the
        string's length if the data continues with it, otherwise None.
        Anything else raises ValueError.
        """
        if isinstance(r, compat_Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    @staticmethod
    def _width_of(token):
        # Number of characters a successful match result stands for
        if isinstance(token, compat_Match):
            return len(token.group(0))
        if isinstance(token, str):
            return len(token)
        if isinstance(token, int):
            return token
        raise ValueError(token)

    def advance(self, by):
        """
        Move the position forward by what `by` represents (a match object,
        a string, an integer count, or None for "stay put") and hand `by`
        back to the caller. Unsupported types raise ValueError.
        """
        if by is not None:
            self._pos += self._width_of(by)
        return by

    def consume(self, r):
        """
        match() followed by advance(): step over r if it is present and
        return whatever match() produced.
        """
        return self.advance(self.match(r))

    def child(self):
        """
        Create a child state that starts at the current position.
        """
        return _MatchChildParser(self)
61
62
class _MatchChildParser(_MatchParser):
    """
    A parser state that shares its parent's data but keeps a position of
    its own. Work on a child when the elements being read may have to be
    abandoned: the parent only moves once commit() is called.
    """

    def __init__(self, parent):
        super(_MatchChildParser, self).__init__(parent._data)
        # Begin where the parent currently stands
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent state up to this child's current position and
        return the parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
82
83
class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the parser
    position and up to 20 characters of the data found there.
    """

    def __init__(self, parser):
        offset = parser._pos
        excerpt = parser._data[offset:offset + 20]
        message = "Parse error at position %u (near %r)" % (offset, excerpt)
        super(ParseError, self).__init__(message)
89
90
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Capture groups: (1) hours, optional; (2) minutes; (3) seconds;
# (4) milliseconds, optional as written here.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Zero-width match at the very end of the data
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, CR or LF
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
104
105
def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.

    _REGEX_TS makes both the hour group and the millisecond group optional;
    a group that did not take part in the match is counted as zero rather
    than being passed to int() as None.
    """
    # Milliseconds contributed by one unit of each group, in group order
    scales = (3600000, 60000, 1000, 1)
    return 90 * sum(
        int(part or 0) * scale
        for part, scale in zip(ts.groups(), scales))
119
120
def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision finer than one millisecond is lost.
    """
    # 90 ticks make one millisecond; adding 45 first rounds to the nearest
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)
127
128
class Block(object):
    """
    Base class for WebVTT blocks. Subclasses are expected to provide a
    compiled _REGEX for the default parse() to work.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this class at the parser's position.
        On success the parser is advanced past the block and an instance
        holding the matched text in `raw` is returned; otherwise the parser
        is left alone and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's original text to stream.
        """
        stream.write(self.raw)
148
149
class HeaderBlock(Block):
    """
    Marker base class for blocks that are only allowed in the header part
    of a WebVTT file, that is, ahead of the first cue block.
    """
157
158
class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line that opens a file, optionally followed by
    an X-TIMESTAMP-MAP line.

    Instances created by parse() carry:
    extra  -- the text following "WEBVTT" on the signature line (including
              its leading space or tab), or None
    local  -- the LOCAL timestamp of the map in 90 kHz ticks, or None
    mpegts -- the MPEGTS value of the map as an integer, or None
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the comma-separated LOCAL:/MPEGTS: entries that follow
        "X-TIMESTAMP-MAP=" up to and including the line terminator.

        Returns a (local, mpegts) tuple; a key that is absent from the map
        is reported as None. Raises ParseError on malformed input.
        """
        parser = parser.child()

        # Start from "not given": a map naming only one of the two keys
        # must not leave the other variable unbound at the return below
        local, mpegts = None, None

        while True:
            m = parser.consume(cls._REGEX_TSMAP_LOCAL)
            if m:
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
                if local is None:
                    raise ParseError(parser)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m:
                    mpegts = int_or_none(m.group(1))
                    if mpegts is None:
                        raise ParseError(parser)
                else:
                    raise ParseError(parser)
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that must follow them.

        Unlike Block.parse() this never returns None: input that does not
        fit raises ParseError. The caller's parser is only advanced on
        success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the timestamp map (only when local or
        mpegts is non-zero) and the terminating blank line to stream.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            # A value that is missing is written out as zero
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
235
236
class StyleBlock(HeaderBlock):
    """
    A STYLE block. Its content is kept verbatim in `raw` (see Block.parse).
    """

    # "STYLE" alone on a line (trailing spaces/tabs allowed), then any
    # number of non-empty lines that do not contain "-->", then one more
    # line terminator, i.e. the empty line that closes the block.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
243
244
class RegionBlock(HeaderBlock):
    """
    A REGION definition block. Its content is kept verbatim in `raw`
    (see Block.parse).
    """

    # The W3C grammar puts a line terminator after the "REGION" header line
    # and the region settings on the lines that follow. Without consuming
    # that terminator only "REGION\n" would match and the settings lines
    # would be left behind for the cue parser to choke on. The terminator
    # is optional here so that everything the previous pattern accepted
    # (e.g. settings on the "REGION" line itself) still matches.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
251
252
class CommentBlock(Block):
    """
    A NOTE (comment) block. Its content is kept verbatim in `raw`
    (see Block.parse).
    """

    # "NOTE" followed by a space, tab or line terminator, then any number
    # of non-empty lines that do not contain "-->", then one more line
    # terminator, i.e. the empty line that closes the block.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
259
260
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry:
    id       -- the cue identifier line, or None
    start    -- start time in 90 kHz ticks (see _parse_ts)
    end      -- end time in 90 kHz ticks
    settings -- the cue settings text after the timings, or None
    text     -- the payload lines, concatenated as read
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue at the parser's position.

        Returns a CueBlock and advances the parser past the cue, or returns
        None and leaves the parser untouched when no cue is found there.
        """
        # Work on a child so that a partially read cue can be abandoned
        parser = parser.child()

        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload runs until the first empty line or the end of data
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue (identifier line if any, timing line with settings
        if any, payload and a final line feed) to stream.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue's five attributes as a plain dict.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Decline the comparison for foreign objects instead of failing
        # with AttributeError on their missing `as_json`; Python then falls
        # back to its default handling and `cue == something_else` is False
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict shaped like the one `as_json` returns.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether `other` continues this cue: same text, same settings,
        and `other` starts exactly where this cue ends (with neither cue
        ending before it starts).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
353
354
def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks found in frag_content,
    a bytes object holding the raw, UTF-8 encoded contents of a WebVTT file.

    The first block yielded is always the Magic block. ParseError is raised
    when the data cannot be parsed.
    """

    state = _MatchParser(frag_content.decode('utf-8'))

    yield Magic.parse(state)

    # Header part: regions, styles and comments, separated by blank lines.
    # It ends at the first thing that is none of those.
    in_header = True
    while in_header and not state.match(_REGEX_EOF):
        if state.consume(_REGEX_BLANK):
            continue

        # XXX: comment blocks could be skipped instead of yielded
        for kind in (RegionBlock, StyleBlock, CommentBlock):
            parsed = kind.parse(state)
            if parsed:
                yield parsed
                break
        else:
            in_header = False

    # Cue part: only comments and cues may appear until the end of data
    while not state.match(_REGEX_EOF):
        if state.consume(_REGEX_BLANK):
            continue

        # XXX: comment blocks could be skipped instead of yielded
        for kind in (CommentBlock, CueBlock):
            parsed = kind.parse(state)
            if parsed:
                yield parsed
                break
        else:
            raise ParseError(state)