]>
jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
4 timestamps on the way, while everything else is passed through unmodified.
6 Regular expressions based on the W3C WebVTT specification
7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
13 from .utils
import int_or_none
, timetuple_from_msec
22 An object that maintains the current parsing position and allows
23 conveniently advancing it as syntax elements are successfully parsed.
26 def __init__(self
, string
):
31 if isinstance(r
, compat_Pattern
):
32 return r
.match(self
._data
, self
._pos
)
33 if isinstance(r
, str):
34 if self
._data
.startswith(r
, self
._pos
):
39 def advance(self
, by
):
42 elif isinstance(by
, compat_Match
):
43 amt
= len(by
.group(0))
44 elif isinstance(by
, str):
46 elif isinstance(by
, int):
54 return self
.advance(self
.match(r
))
57 return _MatchChildParser(self
)
60 class _MatchChildParser(_MatchParser
):
62 A child parser state, which advances through the same data as
63 its parent, but has an independent position. This is useful when
64 advancing through syntax elements we might later want to backtrack from.
68 def __init__(self
, parent
):
69 super().__init
__(parent
._data
)
70 self
.__parent
= parent
71 self
._pos
= parent
._pos
75 Advance the parent state to the current position of this child state.
77 self
.__parent
._pos
= self
._pos
class ParseError(Exception):
    """
    Raised when the input cannot be parsed at the parser's current position.
    """

    def __init__(self, parser):
        """
        Build the error message from the state of the failing parser.

        @param parser  an object exposing ``_data`` (the text being parsed)
                       and ``_pos`` (the integer offset where parsing failed)
        """
        # Quote at most 20 characters starting at the failure offset so the
        # message shows what could not be parsed without dumping the input.
        excerpt = parser._data[parser._pos:parser._pos + 20]
        super().__init__(
            "Parse error at position %u (near %r)" % (parser._pos, excerpt))
88 # While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
89 # prescribes that hours must be *2 or more* digits, timestamps with a single
90 # digit for the hour part have been seen in the wild.
91 # See https://github.com/yt-dlp/yt-dlp/issues/921
92 _REGEX_TS
= re
.compile(r
'''(?x)
# Matches only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator: CRLF, or a lone CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
105 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
106 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
109 h
, min, s
, ms
= ts
.groups()
111 int(h
or 0) * 3600000 + # noqa: W504,E221,E222
112 int(min) * 60000 + # noqa: W504,E221,E222
113 int(s
) * 1000 + # noqa: W504,E221,E222
114 int(ms
) # noqa: W504,E221,E222
120 Convert an MPEG PES timestamp into a WebVTT timestamp.
121 This will lose sub-millisecond precision.
123 return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts
+ 45) // 90))
128 An abstract WebVTT block.
131 def __init__(self
, **kwargs
):
132 for key
, val
in kwargs
.items():
133 setattr(self
, key
, val
)
136 def parse(cls
, parser
):
137 m
= parser
.match(cls
._REGEX
)
141 return cls(raw
=m
.group(0))
143 def write_into(self
, stream
):
144 stream
.write(self
.raw
)
147 class HeaderBlock(Block
):
149 A WebVTT block that may only appear in the header part of the file,
150 i.e. before any cue blocks.
156 class Magic(HeaderBlock
):
157 _REGEX
= re
.compile(r
'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
159 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
160 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
161 # specifies neither the exact grammar nor where in the WebVTT
162 # syntax it should be placed; the grammar below has been devised
163 # based on usage in the wild.
165 # And strictly speaking, the presence of this extension violates
166 # the W3C WebVTT spec. Oh well.
168 _REGEX_TSMAP
= re
.compile(r
'X-TIMESTAMP-MAP=')
169 _REGEX_TSMAP_LOCAL
= re
.compile(r
'LOCAL:')
170 _REGEX_TSMAP_MPEGTS
= re
.compile(r
'MPEGTS:([0-9]+)')
171 _REGEX_TSMAP_SEP
= re
.compile(r
'[ \t]*,[ \t]*')
174 def __parse_tsmap(cls
, parser
):
175 parser
= parser
.child()
178 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
180 m
= parser
.consume(_REGEX_TS
)
182 raise ParseError(parser
)
185 raise ParseError(parser
)
187 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
189 mpegts
= int_or_none(m
.group(1))
191 raise ParseError(parser
)
193 raise ParseError(parser
)
194 if parser
.consume(cls
._REGEX
_TSMAP
_SEP
):
196 if parser
.consume(_REGEX_NL
):
198 raise ParseError(parser
)
204 def parse(cls
, parser
):
205 parser
= parser
.child()
207 m
= parser
.consume(cls
._REGEX
)
209 raise ParseError(parser
)
212 local
, mpegts
= None, None
213 if parser
.consume(cls
._REGEX
_TSMAP
):
214 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
215 if not parser
.consume(_REGEX_NL
):
216 raise ParseError(parser
)
218 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
)
220 def write_into(self
, stream
):
221 stream
.write('WEBVTT')
222 if self
.extra
is not None:
223 stream
.write(self
.extra
)
225 if self
.local
or self
.mpegts
:
226 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
227 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
228 stream
.write(',MPEGTS:')
229 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
234 class StyleBlock(HeaderBlock
):
235 _REGEX
= re
.compile(r
'''(?x)
236 STYLE[\ \t]*(?:\r\n|[\r\n])
237 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
242 class RegionBlock(HeaderBlock
):
243 _REGEX
= re
.compile(r
'''(?x)
245 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
250 class CommentBlock(Block
):
251 _REGEX
= re
.compile(r
'''(?x)
252 NOTE(?:\r\n|[\ \t\r\n])
253 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
258 class CueBlock(Block
):
260 A cue block. The payload is not interpreted.
263 _REGEX_ID
= re
.compile(r
'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
264 _REGEX_ARROW
= re
.compile(r
'[ \t]+-->[ \t]+')
265 _REGEX_SETTINGS
= re
.compile(r
'[ \t]+((?:(?!-->)[^\r\n])+)')
266 _REGEX_PAYLOAD
= re
.compile(r
'[^\r\n]+(?:\r\n|[\r\n])?')
269 def parse(cls
, parser
):
270 parser
= parser
.child()
273 m
= parser
.consume(cls
._REGEX
_ID
)
277 m0
= parser
.consume(_REGEX_TS
)
280 if not parser
.consume(cls
._REGEX
_ARROW
):
282 m1
= parser
.consume(_REGEX_TS
)
285 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
286 if not parser
.consume(_REGEX_NL
):
289 start
= _parse_ts(m0
)
291 settings
= m2
.group(1) if m2
is not None else None
295 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
298 text
.write(m
.group(0))
303 start
=start
, end
=end
, settings
=settings
,
307 def write_into(self
, stream
):
308 if self
.id is not None:
309 stream
.write(self
.id)
311 stream
.write(_format_ts(self
.start
))
312 stream
.write(' --> ')
313 stream
.write(_format_ts(self
.end
))
314 if self
.settings
is not None:
316 stream
.write(self
.settings
)
318 stream
.write(self
.text
)
328 'settings': self
.settings
,
331 def __eq__(self
, other
):
332 return self
.as_json
== other
.as_json
335 def from_json(cls
, json
):
341 settings
=json
['settings']
344 def hinges(self
, other
):
345 if self
.text
!= other
.text
:
347 if self
.settings
!= other
.settings
:
349 return self
.start
<= self
.end
== other
.start
<= other
.end
352 def parse_fragment(frag_content
):
354 A generator that yields (partially) parsed WebVTT blocks when given
355 a bytes object containing the raw contents of a WebVTT file.
358 parser
= _MatchParser(frag_content
.decode('utf-8'))
360 yield Magic
.parse(parser
)
362 while not parser
.match(_REGEX_EOF
):
363 if parser
.consume(_REGEX_BLANK
):
366 block
= RegionBlock
.parse(parser
)
370 block
= StyleBlock
.parse(parser
)
374 block
= CommentBlock
.parse(parser
)
376 yield block
# XXX: or skip
381 while not parser
.match(_REGEX_EOF
):
382 if parser
.consume(_REGEX_BLANK
):
385 block
= CommentBlock
.parse(parser
)
387 yield block
# XXX: or skip
389 block
= CueBlock
.parse(parser
)
394 raise ParseError(parser
)