]>
jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
13 from .compat
import re
14 from .utils
import int_or_none
, timetuple_from_msec
19 An object that maintains the current parsing position and allows
20 conveniently advancing it as syntax elements are successfully parsed.
23 def __init__(self
, string
):
28 if isinstance(r
, re
.Pattern
):
29 return r
.match(self
._data
, self
._pos
)
30 if isinstance(r
, str):
31 if self
._data
.startswith(r
, self
._pos
):
36 def advance(self
, by
):
39 elif isinstance(by
, re
.Match
):
40 amt
= len(by
.group(0))
41 elif isinstance(by
, str):
43 elif isinstance(by
, int):
51 return self
.advance(self
.match(r
))
54 return _MatchChildParser(self
)
57 class _MatchChildParser(_MatchParser
):
59 A child parser state, which advances through the same data as
60 its parent, but has an independent position. This is useful when
61 advancing through syntax elements we might later want to backtrack
def __init__(self, parent):
    """Create a child state over the parent's data, starting where the parent is.

    parent: the parser state to branch from; its `_data` and `_pos` are read.
    """
    super().__init__(parent._data)
    # Begin at the parent's current offset; from here on the two positions
    # are separate attributes on separate objects.
    self._pos = parent._pos
    # Remembered so the parent's position can later be set from this child's.
    self.__parent = parent
72 Advance the parent state to the current position of this child state.
74 self
.__parent
._pos
= self
._pos
class ParseError(Exception):
    """Raised when the input does not match the expected WebVTT syntax.

    The message reports the parser's current offset together with an excerpt
    of at most 20 characters of the data starting at that offset.
    """

    def __init__(self, parser):
        """Build the error message from a parser state.

        parser: any object exposing `_pos` (integer offset) and `_data`
            (the sliceable text being parsed).
        """
        offset = parser._pos
        # Slicing past the end of the data is harmless: the excerpt is
        # simply shorter (or empty) near the end of the input.
        excerpt = parser._data[offset:offset + 20]
        super().__init__("Parse error at position %u (near %r)" % (offset, excerpt))
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
89 _REGEX_TS
= re
.compile(r
'''(?x)
# Matches (with zero width) only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')

# Exactly one line terminator: CRLF, a lone CR, or a lone LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')

# A run of one or more line terminators; built from the single-terminator
# pattern so the two cannot drift apart.
_REGEX_BLANK = re.compile(_REGEX_NL.pattern + '+')
102 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
103 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
106 int(part
or 0) * mult
for part
, mult
in zip(ts
.groups(), (3600_000, 60_000, 1000, 1)))
111 Convert an MPEG PES timestamp into a WebVTT timestamp.
112 This will lose sub-millisecond precision.
114 return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts
+ 45) // 90))
119 An abstract WebVTT block.
def __init__(self, **kwargs):
    """Store every keyword argument as an attribute of the same name."""
    for name in kwargs:
        setattr(self, name, kwargs[name])
127 def parse(cls
, parser
):
128 m
= parser
.match(cls
._REGEX
)
132 return cls(raw
=m
.group(0))
def write_into(self, stream):
    """Write this block's `raw` text to `stream` unchanged.

    stream: any object with a `write` method accepting the stored text.
    """
    emit = stream.write
    emit(self.raw)
138 class HeaderBlock(Block
):
140 A WebVTT block that may only appear in the header part of the file,
141 i.e. before any cue blocks.
147 class Magic(HeaderBlock
):
148 _REGEX
= re
.compile(r
'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
150 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
151 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
152 # doesn’t specify the exact grammar nor where in the WebVTT
153 # syntax it should be placed; the below has been devised based
154 # on usage in the wild
156 # And strictly speaking, the presence of this extension violates
157 # the W3C WebVTT spec. Oh well.
159 _REGEX_TSMAP
= re
.compile(r
'X-TIMESTAMP-MAP=')
160 _REGEX_TSMAP_LOCAL
= re
.compile(r
'LOCAL:')
161 _REGEX_TSMAP_MPEGTS
= re
.compile(r
'MPEGTS:([0-9]+)')
162 _REGEX_TSMAP_SEP
= re
.compile(r
'[ \t]*,[ \t]*')
165 def __parse_tsmap(cls
, parser
):
166 parser
= parser
.child()
169 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
171 m
= parser
.consume(_REGEX_TS
)
173 raise ParseError(parser
)
176 raise ParseError(parser
)
178 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
180 mpegts
= int_or_none(m
.group(1))
182 raise ParseError(parser
)
184 raise ParseError(parser
)
185 if parser
.consume(cls
._REGEX
_TSMAP
_SEP
):
187 if parser
.consume(_REGEX_NL
):
189 raise ParseError(parser
)
195 def parse(cls
, parser
):
196 parser
= parser
.child()
198 m
= parser
.consume(cls
._REGEX
)
200 raise ParseError(parser
)
203 local
, mpegts
= None, None
204 if parser
.consume(cls
._REGEX
_TSMAP
):
205 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
206 if not parser
.consume(_REGEX_NL
):
207 raise ParseError(parser
)
209 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
)
211 def write_into(self
, stream
):
212 stream
.write('WEBVTT')
213 if self
.extra
is not None:
214 stream
.write(self
.extra
)
216 if self
.local
or self
.mpegts
:
217 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
218 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
219 stream
.write(',MPEGTS:')
220 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
225 class StyleBlock(HeaderBlock
):
226 _REGEX
= re
.compile(r
'''(?x)
227 STYLE[\ \t]*(?:\r\n|[\r\n])
228 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
233 class RegionBlock(HeaderBlock
):
234 _REGEX
= re
.compile(r
'''(?x)
236 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
241 class CommentBlock(Block
):
242 _REGEX
= re
.compile(r
'''(?x)
243 NOTE(?:\r\n|[\ \t\r\n])
244 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
249 class CueBlock(Block
):
251 A cue block. The payload is not interpreted.
254 _REGEX_ID
= re
.compile(r
'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
255 _REGEX_ARROW
= re
.compile(r
'[ \t]+-->[ \t]+')
256 _REGEX_SETTINGS
= re
.compile(r
'[ \t]+((?:(?!-->)[^\r\n])+)')
257 _REGEX_PAYLOAD
= re
.compile(r
'[^\r\n]+(?:\r\n|[\r\n])?')
260 def parse(cls
, parser
):
261 parser
= parser
.child()
264 m
= parser
.consume(cls
._REGEX
_ID
)
268 m0
= parser
.consume(_REGEX_TS
)
271 if not parser
.consume(cls
._REGEX
_ARROW
):
273 m1
= parser
.consume(_REGEX_TS
)
276 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
277 if not parser
.consume(_REGEX_NL
):
280 start
= _parse_ts(m0
)
282 settings
= m2
.group(1) if m2
is not None else None
286 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
289 text
.write(m
.group(0))
294 start
=start
, end
=end
, settings
=settings
,
298 def write_into(self
, stream
):
299 if self
.id is not None:
300 stream
.write(self
.id)
302 stream
.write(_format_ts(self
.start
))
303 stream
.write(' --> ')
304 stream
.write(_format_ts(self
.end
))
305 if self
.settings
is not None:
307 stream
.write(self
.settings
)
309 stream
.write(self
.text
)
319 'settings': self
.settings
,
def __eq__(self, other):
    """Compare two cues by their `as_json` representations.

    Returns NotImplemented when `other` exposes no `as_json`, so comparing
    a cue with an unrelated object (or None) evaluates to False through
    Python's normal fallback instead of raising AttributeError.
    """
    if not hasattr(other, 'as_json'):
        return NotImplemented
    return self.as_json == other.as_json
326 def from_json(cls
, json
):
332 settings
=json
['settings']
335 def hinges(self
, other
):
336 if self
.text
!= other
.text
:
338 if self
.settings
!= other
.settings
:
340 return self
.start
<= self
.end
== other
.start
<= other
.end
343 def parse_fragment(frag_content
):
345 A generator that yields (partially) parsed WebVTT blocks when given
346 a bytes object containing the raw contents of a WebVTT file.
349 parser
= _MatchParser(frag_content
.decode())
351 yield Magic
.parse(parser
)
353 while not parser
.match(_REGEX_EOF
):
354 if parser
.consume(_REGEX_BLANK
):
357 block
= RegionBlock
.parse(parser
)
361 block
= StyleBlock
.parse(parser
)
365 block
= CommentBlock
.parse(parser
)
367 yield block
# XXX: or skip
372 while not parser
.match(_REGEX_EOF
):
373 if parser
.consume(_REGEX_BLANK
):
376 block
= CommentBlock
.parse(parser
)
378 yield block
# XXX: or skip
380 block
= CueBlock
.parse(parser
)
385 raise ParseError(parser
)