]>
jfr.im git - yt-dlp.git/blob - yt_dlp/webvtt.py
2 from __future__
import unicode_literals
, print_function
, division
5 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
6 to be able to assemble a single stand-alone subtitle file, suitably adjusting
7 timestamps on the way, while everything else is passed through unmodified.
9 Regular expressions based on the W3C WebVTT specification
10 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
11 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
16 from .utils
import int_or_none
, timetuple_from_msec
23 class _MatchParser(object):
25 An object that maintains the current parsing position and allows
26 conveniently advancing it as syntax elements are successfully parsed.
29 def __init__(self
, string
):
34 if isinstance(r
, compat_Pattern
):
35 return r
.match(self
._data
, self
._pos
)
36 if isinstance(r
, str):
37 if self
._data
.startswith(r
, self
._pos
):
42 def advance(self
, by
):
45 elif isinstance(by
, compat_Match
):
46 amt
= len(by
.group(0))
47 elif isinstance(by
, str):
49 elif isinstance(by
, int):
57 return self
.advance(self
.match(r
))
60 return _MatchChildParser(self
)
63 class _MatchChildParser(_MatchParser
):
65 A child parser state, which advances through the same data as
66 its parent, but has an independent position. This is useful when
67 advancing through syntax elements we might later want to backtrack
71 def __init__(self
, parent
):
72 super(_MatchChildParser
, self
).__init
__(parent
._data
)
73 self
.__parent
= parent
74 self
._pos
= parent
._pos
78 Advance the parent state to the current position of this child state.
80 self
.__parent
._pos
= self
._pos
class ParseError(Exception):
    """
    Raised when the input cannot be parsed at the parser's current position.

    The message reports the parser's position together with a short excerpt
    (at most 20 characters) of the data starting at that position.
    """

    def __init__(self, parser):
        position = parser._pos
        # Show a small window of the upcoming input to make the error
        # location easy to find.
        excerpt = parser._data[position:position + 20]
        message = "Parse error at position %u (near %r)" % (position, excerpt)
        super(ParseError, self).__init__(message)
91 # While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
92 # prescribes that hours must be *2 or more* digits, timestamps with a single
93 # digit for the hour part have been seen in the wild.
94 # See https://github.com/yt-dlp/yt-dlp/issues/921
95 _REGEX_TS
= re
.compile(r
'''(?x)
# Zero-width match at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, a lone CR, or a lone LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
108 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
109 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
112 h
, min, s
, ms
= ts
.groups()
114 int(h
or 0) * 3600000 + # noqa: W504,E221,E222
115 int(min) * 60000 + # noqa: W504,E221,E222
116 int(s
) * 1000 + # noqa: W504,E221,E222
117 int(ms
) # noqa: W504,E221,E222
123 Convert an MPEG PES timestamp into a WebVTT timestamp.
124 This will lose sub-millisecond precision.
126 return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts
+ 45) // 90))
131 An abstract WebVTT block.
134 def __init__(self
, **kwargs
):
135 for key
, val
in kwargs
.items():
136 setattr(self
, key
, val
)
139 def parse(cls
, parser
):
140 m
= parser
.match(cls
._REGEX
)
144 return cls(raw
=m
.group(0))
146 def write_into(self
, stream
):
147 stream
.write(self
.raw
)
150 class HeaderBlock(Block
):
152 A WebVTT block that may only appear in the header part of the file,
153 i.e. before any cue blocks.
159 class Magic(HeaderBlock
):
160 _REGEX
= re
.compile(r
'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
162 # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
163 # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
164 # specifies neither the exact grammar nor where in the WebVTT
165 # syntax it should be placed; the grammar below has been devised
166 # based on usage in the wild.
168 # And strictly speaking, the presence of this extension violates
169 # the W3C WebVTT spec. Oh well.
171 _REGEX_TSMAP
= re
.compile(r
'X-TIMESTAMP-MAP=')
172 _REGEX_TSMAP_LOCAL
= re
.compile(r
'LOCAL:')
173 _REGEX_TSMAP_MPEGTS
= re
.compile(r
'MPEGTS:([0-9]+)')
174 _REGEX_TSMAP_SEP
= re
.compile(r
'[ \t]*,[ \t]*')
177 def __parse_tsmap(cls
, parser
):
178 parser
= parser
.child()
181 m
= parser
.consume(cls
._REGEX
_TSMAP
_LOCAL
)
183 m
= parser
.consume(_REGEX_TS
)
185 raise ParseError(parser
)
188 raise ParseError(parser
)
190 m
= parser
.consume(cls
._REGEX
_TSMAP
_MPEGTS
)
192 mpegts
= int_or_none(m
.group(1))
194 raise ParseError(parser
)
196 raise ParseError(parser
)
197 if parser
.consume(cls
._REGEX
_TSMAP
_SEP
):
199 if parser
.consume(_REGEX_NL
):
201 raise ParseError(parser
)
207 def parse(cls
, parser
):
208 parser
= parser
.child()
210 m
= parser
.consume(cls
._REGEX
)
212 raise ParseError(parser
)
215 local
, mpegts
= None, None
216 if parser
.consume(cls
._REGEX
_TSMAP
):
217 local
, mpegts
= cls
.__parse
_tsmap
(parser
)
218 if not parser
.consume(_REGEX_NL
):
219 raise ParseError(parser
)
221 return cls(extra
=extra
, mpegts
=mpegts
, local
=local
)
223 def write_into(self
, stream
):
224 stream
.write('WEBVTT')
225 if self
.extra
is not None:
226 stream
.write(self
.extra
)
228 if self
.local
or self
.mpegts
:
229 stream
.write('X-TIMESTAMP-MAP=LOCAL:')
230 stream
.write(_format_ts(self
.local
if self
.local
is not None else 0))
231 stream
.write(',MPEGTS:')
232 stream
.write(str(self
.mpegts
if self
.mpegts
is not None else 0))
237 class StyleBlock(HeaderBlock
):
238 _REGEX
= re
.compile(r
'''(?x)
239 STYLE[\ \t]*(?:\r\n|[\r\n])
240 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
245 class RegionBlock(HeaderBlock
):
246 _REGEX
= re
.compile(r
'''(?x)
248 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
253 class CommentBlock(Block
):
254 _REGEX
= re
.compile(r
'''(?x)
255 NOTE(?:\r\n|[\ \t\r\n])
256 ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
261 class CueBlock(Block
):
263 A cue block. The payload is not interpreted.
266 _REGEX_ID
= re
.compile(r
'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
267 _REGEX_ARROW
= re
.compile(r
'[ \t]+-->[ \t]+')
268 _REGEX_SETTINGS
= re
.compile(r
'[ \t]+((?:(?!-->)[^\r\n])+)')
269 _REGEX_PAYLOAD
= re
.compile(r
'[^\r\n]+(?:\r\n|[\r\n])?')
272 def parse(cls
, parser
):
273 parser
= parser
.child()
276 m
= parser
.consume(cls
._REGEX
_ID
)
280 m0
= parser
.consume(_REGEX_TS
)
283 if not parser
.consume(cls
._REGEX
_ARROW
):
285 m1
= parser
.consume(_REGEX_TS
)
288 m2
= parser
.consume(cls
._REGEX
_SETTINGS
)
289 if not parser
.consume(_REGEX_NL
):
292 start
= _parse_ts(m0
)
294 settings
= m2
.group(1) if m2
is not None else None
298 m
= parser
.consume(cls
._REGEX
_PAYLOAD
)
301 text
.write(m
.group(0))
306 start
=start
, end
=end
, settings
=settings
,
310 def write_into(self
, stream
):
311 if self
.id is not None:
312 stream
.write(self
.id)
314 stream
.write(_format_ts(self
.start
))
315 stream
.write(' --> ')
316 stream
.write(_format_ts(self
.end
))
317 if self
.settings
is not None:
319 stream
.write(self
.settings
)
321 stream
.write(self
.text
)
331 'settings': self
.settings
,
334 def __eq__(self
, other
):
335 return self
.as_json
== other
.as_json
338 def from_json(cls
, json
):
344 settings
=json
['settings']
347 def hinges(self
, other
):
348 if self
.text
!= other
.text
:
350 if self
.settings
!= other
.settings
:
352 return self
.start
<= self
.end
== other
.start
<= other
.end
355 def parse_fragment(frag_content
):
357 A generator that yields (partially) parsed WebVTT blocks when given
358 a bytes object containing the raw contents of a WebVTT file.
361 parser
= _MatchParser(frag_content
.decode('utf-8'))
363 yield Magic
.parse(parser
)
365 while not parser
.match(_REGEX_EOF
):
366 if parser
.consume(_REGEX_BLANK
):
369 block
= RegionBlock
.parse(parser
)
373 block
= StyleBlock
.parse(parser
)
377 block
= CommentBlock
.parse(parser
)
379 yield block
# XXX: or skip
384 while not parser
.match(_REGEX_EOF
):
385 if parser
.consume(_REGEX_BLANK
):
388 block
= CommentBlock
.parse(parser
)
390 yield block
# XXX: or skip
392 block
= CueBlock
.parse(parser
)
397 raise ParseError(parser
)