]>
jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
14 from .compat
import compat_Match
, compat_Pattern
15 from .utils
import int_or_none
, timetuple_from_msec
20 An object that maintains the current parsing position and allows
21 conveniently advancing it as syntax elements are successfully parsed.
24 def __init__(self
, string
):
29 if isinstance(r
, compat_Pattern
):
30 return r
.match(self
._data
, self
._pos
)
31 if isinstance(r
, str):
32 if self
._data
.startswith(r
, self
._pos
):
37 def advance(self
, by
):
40 elif isinstance(by
, compat_Match
):
41 amt
= len(by
.group(0))
42 elif isinstance(by
, str):
44 elif isinstance(by
, int):
52 return self
.advance(self
.match(r
))
55 return _MatchChildParser(self
)
58 class _MatchChildParser(_MatchParser
):
60 A child parser state, which advances through the same data as
61 its parent, but has an independent position. This is useful when
62 advancing through syntax elements we might later want to backtrack from.
66 def __init__(self
, parent
):
67 super().__init
__(parent
._data
)
68 self
.__parent
= parent
69 self
._pos
= parent
._pos
73 Advance the parent state to the current position of this child state.
75 self
.__parent
._pos
= self
._pos
79 class ParseError(Exception):
80 def __init__(self
, parser
):
81 super().__init
__("Parse error at position %u (near %r)" % (
82 parser
._pos
, parser
._data
[parser
._pos
:parser
._pos
+ 20]
86 # While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
87 # prescribes that hours must be *2 or more* digits, timestamps with a single
88 # digit for the hour part have been seen in the wild.
89 # See https://github.com/yt-dlp/yt-dlp/issues/921
90 _REGEX_TS
= re
.compile(r
'''(?x)
96 _REGEX_EOF
= re
.compile(r
'\Z')
97 _REGEX_NL
= re
.compile(r
'(?:\r\n|[\r\n])')
98 _REGEX_BLANK
= re
.compile(r
'(?:\r\n|[\r\n])+')
103 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
104 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
107 h
, min, s
, ms
= ts
.groups()
109 int(h
or 0) * 3600000 + # noqa: W504,E221,E222
110 int(min) * 60000 + # noqa: W504,E221,E222
111 int(s
) * 1000 + # noqa: W504,E221,E222
112 int(ms
) # noqa: W504,E221,E222
118 Convert an MPEG PES timestamp into a WebVTT timestamp.
119 This will lose sub-millisecond precision.
121 return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts
+ 45) // 90))
126 An abstract WebVTT block.
129 def __init__(self
, **kwargs
):
130 for key
, val
in kwargs
.items():
131 setattr(self
, key
, val
)
134 def parse(cls
, parser
):
135 m
= parser
.match(cls
._REGEX
)
139 return cls(raw
=m
.group(0))
141 def write_into(self
, stream
):
142 stream
.write(self
.raw
)
145 class HeaderBlock(Block
):
147 A WebVTT block that may only appear in the header part of the file,
148 i.e. before any cue blocks.
154 class Magic(HeaderBlock
):
155 _REGEX
= re
.compile(r
'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
157 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
158 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
159 # doesn’t specify the exact grammar nor where in the WebVTT
160 # syntax it should be placed; the grammar below has been devised based
161 # on usage in the wild.
163 # And strictly speaking, the presence of this extension violates
164 # the W3C WebVTT spec. Oh well.
166 _REGEX_TSMAP
= re
.compile(r
'X-TIMESTAMP-MAP=')
167 _REGEX_TSMAP_LOCAL
= re
.compile(r
'LOCAL:')
168 _REGEX_TSMAP_MPEGTS
= re
.compile(r
'MPEGTS:([0-9]+)')
169 _REGEX_TSMAP_SEP
= re
.compile(r
'[ \t]*,[ \t]*')
172 def __parse_tsmap(cls
, parser
):
173 parser
= parser
.child()
176 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
178 m
= parser
.consume(_REGEX_TS
)
180 raise ParseError(parser
)
183 raise ParseError(parser
)
185 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
187 mpegts
= int_or_none(m
.group(1))
189 raise ParseError(parser
)
191 raise ParseError(parser
)
192 if parser
.consume(cls
._REGEX
_TSMAP
_SEP
):
194 if parser
.consume(_REGEX_NL
):
196 raise ParseError(parser
)
202 def parse(cls
, parser
):
203 parser
= parser
.child()
205 m
= parser
.consume(cls
._REGEX
)
207 raise ParseError(parser
)
210 local
, mpegts
= None, None
211 if parser
.consume(cls
._REGEX
_TSMAP
):
212 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
213 if not parser
.consume(_REGEX_NL
):
214 raise ParseError(parser
)
216 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
)
218 def write_into(self
, stream
):
219 stream
.write('WEBVTT')
220 if self
.extra
is not None:
221 stream
.write(self
.extra
)
223 if self
.local
or self
.mpegts
:
224 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
225 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
226 stream
.write(',MPEGTS:')
227 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
232 class StyleBlock(HeaderBlock
):
233 _REGEX
= re
.compile(r
'''(?x)
234 STYLE[\ \t]*(?:\r\n|[\r\n])
235 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
240 class RegionBlock(HeaderBlock
):
241 _REGEX
= re
.compile(r
'''(?x)
243 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
248 class CommentBlock(Block
):
249 _REGEX
= re
.compile(r
'''(?x)
250 NOTE(?:\r\n|[\ \t\r\n])
251 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
256 class CueBlock(Block
):
258 A cue block. The payload is not interpreted.
261 _REGEX_ID
= re
.compile(r
'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
262 _REGEX_ARROW
= re
.compile(r
'[ \t]+-->[ \t]+')
263 _REGEX_SETTINGS
= re
.compile(r
'[ \t]+((?:(?!-->)[^\r\n])+)')
264 _REGEX_PAYLOAD
= re
.compile(r
'[^\r\n]+(?:\r\n|[\r\n])?')
267 def parse(cls
, parser
):
268 parser
= parser
.child()
271 m
= parser
.consume(cls
._REGEX
_ID
)
275 m0
= parser
.consume(_REGEX_TS
)
278 if not parser
.consume(cls
._REGEX
_ARROW
):
280 m1
= parser
.consume(_REGEX_TS
)
283 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
284 if not parser
.consume(_REGEX_NL
):
287 start
= _parse_ts(m0
)
289 settings
= m2
.group(1) if m2
is not None else None
293 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
296 text
.write(m
.group(0))
301 start
=start
, end
=end
, settings
=settings
,
305 def write_into(self
, stream
):
306 if self
.id is not None:
307 stream
.write(self
.id)
309 stream
.write(_format_ts(self
.start
))
310 stream
.write(' --> ')
311 stream
.write(_format_ts(self
.end
))
312 if self
.settings
is not None:
314 stream
.write(self
.settings
)
316 stream
.write(self
.text
)
326 'settings': self
.settings
,
329 def __eq__(self
, other
):
330 return self
.as_json
== other
.as_json
333 def from_json(cls
, json
):
339 settings
=json
['settings']
342 def hinges(self
, other
):
343 if self
.text
!= other
.text
:
345 if self
.settings
!= other
.settings
:
347 return self
.start
<= self
.end
== other
.start
<= other
.end
350 def parse_fragment(frag_content
):
352 A generator that yields (partially) parsed WebVTT blocks when given
353 a bytes object containing the raw contents of a WebVTT file.
356 parser
= _MatchParser(frag_content
.decode('utf-8'))
358 yield Magic
.parse(parser
)
360 while not parser
.match(_REGEX_EOF
):
361 if parser
.consume(_REGEX_BLANK
):
364 block
= RegionBlock
.parse(parser
)
368 block
= StyleBlock
.parse(parser
)
372 block
= CommentBlock
.parse(parser
)
374 yield block
# XXX: or skip
379 while not parser
.match(_REGEX_EOF
):
380 if parser
.consume(_REGEX_BLANK
):
383 block
= CommentBlock
.parse(parser
)
385 yield block
# XXX: or skip
387 block
= CueBlock
.parse(parser
)
392 raise ParseError(parser
)