]>
jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
2 from __future__
import unicode_literals
, print_function
, division
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
16 from .utils
import int_or_none
24 class _MatchParser(object):
26 An object that maintains the current parsing position and allows
27 conveniently advancing it as syntax elements are successfully parsed.
30 def __init__(self
, string
):
35 if isinstance(r
, compat_Pattern
):
36 return r
.match(self
._data
, self
._pos
)
37 if isinstance(r
, str):
38 if self
._data
.startswith(r
, self
._pos
):
43 def advance(self
, by
):
46 elif isinstance(by
, compat_Match
):
47 amt
= len(by
.group(0))
48 elif isinstance(by
, str):
50 elif isinstance(by
, int):
58 return self
.advance(self
.match(r
))
61 return _MatchChildParser(self
)
64 class _MatchChildParser(_MatchParser
):
66 A child parser state, which advances through the same data as
67 its parent, but has an independent position. This is useful when
68 advancing through syntax elements we might later want to backtrack
72 def __init__(self
, parent
):
73 super(_MatchChildParser
, self
).__init
__(parent
._data
)
74 self
.__parent
= parent
75 self
._pos
= parent
._pos
79 Advance the parent state to the current position of this child state.
81 self
.__parent
._pos
= self
._pos
class ParseError(Exception):
    """
    Raised when parsing cannot continue at the parser's current position.

    The exception message reports the parser's offset (``parser._pos``)
    together with up to 20 characters of the input (``parser._data``)
    starting at that offset, to help locate the offending text.
    """

    def __init__(self, parser):
        offset = parser._pos
        # A short excerpt of what follows the failure point.
        excerpt = parser._data[offset:offset + 20]
        message = "Parse error at position %u (near %r)" % (offset, excerpt)
        super(ParseError, self).__init__(message)
92 _REGEX_TS
= re
.compile(r
'''(?x)
# Matches only at the very end of the input (zero-width).
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, or a lone CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
105 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
106 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
109 h
, min, s
, ms
= ts
.groups()
111 int(h
or 0) * 3600000 + # noqa: W504,E221,E222
112 int(min) * 60000 + # noqa: W504,E221,E222
113 int(s
) * 1000 + # noqa: W504,E221,E222
114 int(ms
) # noqa: W504,E221,E222
120 Convert an MPEG PES timestamp into a WebVTT timestamp.
121 This will lose sub-millisecond precision.
124 ts
= int((ts
+ 45) // 90)
125 ms
, ts
= divmod(ts
, 1000) # noqa: W504,E221,E222,E203
126 s
, ts
= divmod(ts
, 60) # noqa: W504,E221,E222,E203
127 min, h
= divmod(ts
, 60) # noqa: W504,E221,E222
128 return '%02u:%02u:%02u.%03u' % (h
, min, s
, ms
)
133 An abstract WebVTT block.
136 def __init__(self
, **kwargs
):
137 for key
, val
in kwargs
.items():
138 setattr(self
, key
, val
)
141 def parse(cls
, parser
):
142 m
= parser
.match(cls
._REGEX
)
146 return cls(raw
=m
.group(0))
148 def write_into(self
, stream
):
149 stream
.write(self
.raw
)
152 class HeaderBlock(Block
):
154 A WebVTT block that may only appear in the header part of the file,
155 i.e. before any cue blocks.
161 class Magic(HeaderBlock
):
162 _REGEX
= re
.compile(r
'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
164 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
165 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
166 # doesn’t specify the exact grammar nor where in the WebVTT
167 # syntax it should be placed; the below has been devised based
168 # on usage in the wild
170 # And strictly speaking, the presence of this extension violates
171 # the W3C WebVTT spec. Oh well.
173 _REGEX_TSMAP
= re
.compile(r
'X-TIMESTAMP-MAP=')
174 _REGEX_TSMAP_LOCAL
= re
.compile(r
'LOCAL:')
175 _REGEX_TSMAP_MPEGTS
= re
.compile(r
'MPEGTS:([0-9]+)')
178 def __parse_tsmap(cls
, parser
):
179 parser
= parser
.child()
182 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
184 m
= parser
.consume(_REGEX_TS
)
186 raise ParseError(parser
)
189 raise ParseError(parser
)
191 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
193 mpegts
= int_or_none(m
.group(1))
195 raise ParseError(parser
)
197 raise ParseError(parser
)
198 if parser
.consume(','):
200 if parser
.consume(_REGEX_NL
):
202 raise ParseError(parser
)
208 def parse(cls
, parser
):
209 parser
= parser
.child()
211 m
= parser
.consume(cls
._REGEX
)
213 raise ParseError(parser
)
216 local
, mpegts
= None, None
217 if parser
.consume(cls
._REGEX
_TSMAP
):
218 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
219 if not parser
.consume(_REGEX_NL
):
220 raise ParseError(parser
)
222 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
)
224 def write_into(self
, stream
):
225 stream
.write('WEBVTT')
226 if self
.extra
is not None:
227 stream
.write(self
.extra
)
229 if self
.local
or self
.mpegts
:
230 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
231 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
232 stream
.write(',MPEGTS:')
233 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
238 class StyleBlock(HeaderBlock
):
239 _REGEX
= re
.compile(r
'''(?x)
240 STYLE[\ \t]*(?:\r\n|[\r\n])
241 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
246 class RegionBlock(HeaderBlock
):
247 _REGEX
= re
.compile(r
'''(?x)
249 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
254 class CommentBlock(Block
):
255 _REGEX
= re
.compile(r
'''(?x)
256 NOTE(?:\r\n|[\ \t\r\n])
257 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
262 class CueBlock(Block
):
264 A cue block. The payload is not interpreted.
267 _REGEX_ID
= re
.compile(r
'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
268 _REGEX_ARROW
= re
.compile(r
'[ \t]+-->[ \t]+')
269 _REGEX_SETTINGS
= re
.compile(r
'[ \t]+((?:(?!-->)[^\r\n])+)')
270 _REGEX_PAYLOAD
= re
.compile(r
'[^\r\n]+(?:\r\n|[\r\n])?')
273 def parse(cls
, parser
):
274 parser
= parser
.child()
277 m
= parser
.consume(cls
._REGEX
_ID
)
281 m0
= parser
.consume(_REGEX_TS
)
284 if not parser
.consume(cls
._REGEX
_ARROW
):
286 m1
= parser
.consume(_REGEX_TS
)
289 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
290 if not parser
.consume(_REGEX_NL
):
293 start
= _parse_ts(m0
)
295 settings
= m2
.group(1) if m2
is not None else None
299 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
302 text
.write(m
.group(0))
307 start
=start
, end
=end
, settings
=settings
,
311 def write_into(self
, stream
):
312 if self
.id is not None:
313 stream
.write(self
.id)
315 stream
.write(_format_ts(self
.start
))
316 stream
.write(' --> ')
317 stream
.write(_format_ts(self
.end
))
318 if self
.settings
is not None:
320 stream
.write(self
.settings
)
322 stream
.write(self
.text
)
332 'settings': self
.settings
,
336 def parse_fragment(frag_content
):
338 A generator that yields (partially) parsed WebVTT blocks when given
339 a bytes object containing the raw contents of a WebVTT file.
342 parser
= _MatchParser(frag_content
.decode('utf-8'))
344 yield Magic
.parse(parser
)
346 while not parser
.match(_REGEX_EOF
):
347 if parser
.consume(_REGEX_BLANK
):
350 block
= RegionBlock
.parse(parser
)
354 block
= StyleBlock
.parse(parser
)
358 block
= CommentBlock
.parse(parser
)
360 yield block
# XXX: or skip
365 while not parser
.match(_REGEX_EOF
):
366 if parser
.consume(_REGEX_BLANK
):
369 block
= CommentBlock
.parse(parser
)
371 yield block
# XXX: or skip
373 block
= CueBlock
.parse(parser
)
378 raise ParseError(parser
)