]>
jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
14 from .compat
import compat_Match
, compat_Pattern
15 from .utils
import int_or_none
, timetuple_from_msec
20 An object that maintains the current parsing position and allows
21 conveniently advancing it as syntax elements are successfully parsed.
24 def __init__(self
, string
):
29 if isinstance(r
, compat_Pattern
):
30 return r
.match(self
._data
, self
._pos
)
31 if isinstance(r
, str):
32 if self
._data
.startswith(r
, self
._pos
):
37 def advance(self
, by
):
40 elif isinstance(by
, compat_Match
):
41 amt
= len(by
.group(0))
42 elif isinstance(by
, str):
44 elif isinstance(by
, int):
52 return self
.advance(self
.match(r
))
55 return _MatchChildParser(self
)
58 class _MatchChildParser(_MatchParser
):
60 A child parser state, which advances through the same data as
61 its parent, but has an independent position. This is useful when
62 advancing through syntax elements we might later want to backtrack
66 def __init__(self
, parent
):
67 super().__init
__(parent
._data
)
68 self
.__parent
= parent
69 self
._pos
= parent
._pos
73 Advance the parent state to the current position of this child state.
75 self
.__parent
._pos
= self
._pos
79 class ParseError(Exception):
80 def __init__(self
, parser
):
81 super().__init
__("Parse error at position %u (near %r)" % (
82 parser
._pos
, parser
._data
[parser
._pos
:parser
._pos
+ 20]
86 # While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
87 # prescribes that hours must be *2 or more* digits, timestamps with a single
88 # digit for the hour part have been seen in the wild.
89 # See https://github.com/yt-dlp/yt-dlp/issues/921
90 _REGEX_TS
= re
.compile(r
'''(?x)
96 _REGEX_EOF
= re
.compile(r
'\Z')
97 _REGEX_NL
= re
.compile(r
'(?:\r\n|[\r\n])')
98 _REGEX_BLANK
= re
.compile(r
'(?:\r\n|[\r\n])+')
103 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
104 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
107 int(part
or 0) * mult
for part
, mult
in zip(ts
.groups(), (3600_000, 60_000, 1000, 1)))
112 Convert an MPEG PES timestamp into a WebVTT timestamp.
113 This will lose sub-millisecond precision.
115 return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts
+ 45) // 90))
120 An abstract WebVTT block.
123 def __init__(self
, **kwargs
):
124 for key
, val
in kwargs
.items():
125 setattr(self
, key
, val
)
128 def parse(cls
, parser
):
129 m
= parser
.match(cls
._REGEX
)
133 return cls(raw
=m
.group(0))
135 def write_into(self
, stream
):
136 stream
.write(self
.raw
)
139 class HeaderBlock(Block
):
141 A WebVTT block that may only appear in the header part of the file,
142 i.e. before any cue blocks.
148 class Magic(HeaderBlock
):
149 _REGEX
= re
.compile(r
'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
151 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
152 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
153 # doesn’t specify the exact grammar nor where in the WebVTT
154 # syntax it should be placed; the below has been devised based
155 # on usage in the wild
157 # And strictly speaking, the presence of this extension violates
158 # the W3C WebVTT spec. Oh well.
160 _REGEX_TSMAP
= re
.compile(r
'X-TIMESTAMP-MAP=')
161 _REGEX_TSMAP_LOCAL
= re
.compile(r
'LOCAL:')
162 _REGEX_TSMAP_MPEGTS
= re
.compile(r
'MPEGTS:([0-9]+)')
163 _REGEX_TSMAP_SEP
= re
.compile(r
'[ \t]*,[ \t]*')
166 def __parse_tsmap(cls
, parser
):
167 parser
= parser
.child()
170 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
172 m
= parser
.consume(_REGEX_TS
)
174 raise ParseError(parser
)
177 raise ParseError(parser
)
179 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
181 mpegts
= int_or_none(m
.group(1))
183 raise ParseError(parser
)
185 raise ParseError(parser
)
186 if parser
.consume(cls
._REGEX
_TSMAP
_SEP
):
188 if parser
.consume(_REGEX_NL
):
190 raise ParseError(parser
)
196 def parse(cls
, parser
):
197 parser
= parser
.child()
199 m
= parser
.consume(cls
._REGEX
)
201 raise ParseError(parser
)
204 local
, mpegts
= None, None
205 if parser
.consume(cls
._REGEX
_TSMAP
):
206 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
207 if not parser
.consume(_REGEX_NL
):
208 raise ParseError(parser
)
210 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
)
212 def write_into(self
, stream
):
213 stream
.write('WEBVTT')
214 if self
.extra
is not None:
215 stream
.write(self
.extra
)
217 if self
.local
or self
.mpegts
:
218 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
219 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
220 stream
.write(',MPEGTS:')
221 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
226 class StyleBlock(HeaderBlock
):
227 _REGEX
= re
.compile(r
'''(?x)
228 STYLE[\ \t]*(?:\r\n|[\r\n])
229 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
234 class RegionBlock(HeaderBlock
):
235 _REGEX
= re
.compile(r
'''(?x)
237 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
242 class CommentBlock(Block
):
243 _REGEX
= re
.compile(r
'''(?x)
244 NOTE(?:\r\n|[\ \t\r\n])
245 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
250 class CueBlock(Block
):
252 A cue block. The payload is not interpreted.
255 _REGEX_ID
= re
.compile(r
'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
256 _REGEX_ARROW
= re
.compile(r
'[ \t]+-->[ \t]+')
257 _REGEX_SETTINGS
= re
.compile(r
'[ \t]+((?:(?!-->)[^\r\n])+)')
258 _REGEX_PAYLOAD
= re
.compile(r
'[^\r\n]+(?:\r\n|[\r\n])?')
261 def parse(cls
, parser
):
262 parser
= parser
.child()
265 m
= parser
.consume(cls
._REGEX
_ID
)
269 m0
= parser
.consume(_REGEX_TS
)
272 if not parser
.consume(cls
._REGEX
_ARROW
):
274 m1
= parser
.consume(_REGEX_TS
)
277 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
278 if not parser
.consume(_REGEX_NL
):
281 start
= _parse_ts(m0
)
283 settings
= m2
.group(1) if m2
is not None else None
287 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
290 text
.write(m
.group(0))
295 start
=start
, end
=end
, settings
=settings
,
299 def write_into(self
, stream
):
300 if self
.id is not None:
301 stream
.write(self
.id)
303 stream
.write(_format_ts(self
.start
))
304 stream
.write(' --> ')
305 stream
.write(_format_ts(self
.end
))
306 if self
.settings
is not None:
308 stream
.write(self
.settings
)
310 stream
.write(self
.text
)
320 'settings': self
.settings
,
323 def __eq__(self
, other
):
324 return self
.as_json
== other
.as_json
327 def from_json(cls
, json
):
333 settings
=json
['settings']
336 def hinges(self
, other
):
337 if self
.text
!= other
.text
:
339 if self
.settings
!= other
.settings
:
341 return self
.start
<= self
.end
== other
.start
<= other
.end
344 def parse_fragment(frag_content
):
346 A generator that yields (partially) parsed WebVTT blocks when given
347 a bytes object containing the raw contents of a WebVTT file.
350 parser
= _MatchParser(frag_content
.decode('utf-8'))
352 yield Magic
.parse(parser
)
354 while not parser
.match(_REGEX_EOF
):
355 if parser
.consume(_REGEX_BLANK
):
358 block
= RegionBlock
.parse(parser
)
362 block
= StyleBlock
.parse(parser
)
366 block
= CommentBlock
.parse(parser
)
368 yield block
# XXX: or skip
373 while not parser
.match(_REGEX_EOF
):
374 if parser
.consume(_REGEX_BLANK
):
377 block
= CommentBlock
.parse(parser
)
379 yield block
# XXX: or skip
381 block
= CueBlock
.parse(parser
)
386 raise ParseError(parser
)