]> jfr.im git - yt-dlp.git/blame - yt_dlp/webvtt.py
[youtube:search] Support hashtag entries (#3265)
[yt-dlp.git] / yt_dlp / webvtt.py
CommitLineData
4a2f19ab
F
1# coding: utf-8
2from __future__ import unicode_literals, print_function, division
3
4"""
5A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
6to be able to assemble a single stand-alone subtitle file, suitably adjusting
7timestamps on the way, while everything else is passed through unmodified.
8
9Regular expressions based on the W3C WebVTT specification
10<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
11in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
12"""
13
14import re
15import io
aa7785f8 16from .utils import int_or_none, timetuple_from_msec
4a2f19ab
F
17from .compat import (
18 compat_str as str,
19 compat_Pattern,
20 compat_Match,
21)
22
23
24class _MatchParser(object):
25 """
26 An object that maintains the current parsing position and allows
27 conveniently advancing it as syntax elements are successfully parsed.
28 """
29
30 def __init__(self, string):
31 self._data = string
32 self._pos = 0
33
34 def match(self, r):
35 if isinstance(r, compat_Pattern):
36 return r.match(self._data, self._pos)
37 if isinstance(r, str):
38 if self._data.startswith(r, self._pos):
39 return len(r)
40 return None
41 raise ValueError(r)
42
43 def advance(self, by):
44 if by is None:
45 amt = 0
46 elif isinstance(by, compat_Match):
47 amt = len(by.group(0))
48 elif isinstance(by, str):
49 amt = len(by)
50 elif isinstance(by, int):
51 amt = by
52 else:
53 raise ValueError(by)
54 self._pos += amt
55 return by
56
57 def consume(self, r):
58 return self.advance(self.match(r))
59
60 def child(self):
61 return _MatchChildParser(self)
62
63
class _MatchChildParser(_MatchParser):
    """
    A parser forked from a parent parser. It reads the same data but keeps
    a position of its own, so syntax can be parsed speculatively: drop the
    child to backtrack, or call commit() to keep the progress made.
    """

    def __init__(self, parent):
        super(_MatchChildParser, self).__init__(parent._data)
        # Start where the parent currently stands.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Copy this child's position onto the parent and return the parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner
83
84
class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the
    parser's position and up to 20 characters of the data found there.
    """

    def __init__(self, parser):
        pos = parser._pos
        nearby = parser._data[pos:pos + 20]
        message = "Parse error at position %u (near %r)" % (pos, nearby)
        super(ParseError, self).__init__(message)
90
91
81a136b8 92# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
93# prescribes that hours must be *2 or more* digits, timestamps with a single
94# digit for the hour part has been seen in the wild.
95# See https://github.com/yt-dlp/yt-dlp/issues/921
4a2f19ab 96_REGEX_TS = re.compile(r'''(?x)
81a136b8 97 (?:([0-9]{1,}):)?
4a2f19ab
F
98 ([0-9]{2}):
99 ([0-9]{2})\.
100 ([0-9]{3})?
101''')
102_REGEX_EOF = re.compile(r'\Z')
103_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
104_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
105
106
107def _parse_ts(ts):
108 """
109 Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
110 into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
111 """
112
113 h, min, s, ms = ts.groups()
114 return 90 * (
115 int(h or 0) * 3600000 + # noqa: W504,E221,E222
116 int(min) * 60000 + # noqa: W504,E221,E222
117 int(s) * 1000 + # noqa: W504,E221,E222
118 int(ms) # noqa: W504,E221,E222
119 )
120
121
def _format_ts(ts):
    """
    Render an MPEG PES timestamp as a WebVTT timestamp string.

    The tick count is first rounded to whole milliseconds (adding 45 before
    the floor division by 90), so sub-millisecond precision is lost.
    """
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)
4a2f19ab
F
128
129
class Block(object):
    """
    Common base for WebVTT blocks.

    Keyword arguments passed to the constructor become instance attributes.
    The default parse() relies on a compiled pattern stored in cls._REGEX,
    and the default write_into() relies on the ``raw`` attribute.
    """

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try cls._REGEX at the parser's position. On success the parser is
        advanced past the match and a block holding the matched text as
        ``raw`` is returned; otherwise None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's raw text to ``stream`` unchanged.
        """
        stream.write(self.raw)
149
150
class HeaderBlock(Block):
    """
    Base class for the blocks that are only allowed in the header section
    of a WebVTT file, that is, ahead of the first cue block.
    """
158
159
class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line that opens a file, together with the
    optional X-TIMESTAMP-MAP header that may directly follow it.

    Instances carry three attributes:

    extra  -- the text after "WEBVTT" on the signature line, including its
              leading space or tab, or None when there is none
    local  -- the LOCAL timestamp as returned by _parse_ts, or None
    mpegts -- the MPEGTS value as returned by int_or_none, or None
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the comma-separated LOCAL:/MPEGTS: components that follow
        "X-TIMESTAMP-MAP=", up to and including the line terminator.

        Returns a (local, mpegts) tuple; a component that does not appear
        in the header is reported as None. Raises ParseError on anything
        that is not a recognised component, separator or line terminator.
        """
        parser = parser.child()

        # Bind both results up front: a header that carries only one of the
        # two components must not leave the other name undefined.
        local, mpegts = None, None

        while True:
            m = parser.consume(cls._REGEX_TSMAP_LOCAL)
            if m:
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m:
                    mpegts = int_or_none(m.group(1))
                    if mpegts is None:
                        raise ParseError(parser)
                else:
                    raise ParseError(parser)
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that must follow them.

        Works on a child parser, so the caller's position only moves when
        the whole header was parsed. Raises ParseError when the signature
        is missing or the header is not followed by a line terminator.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, then an X-TIMESTAMP-MAP line when either
        ``local`` or ``mpegts`` is truthy (a value of None is written as 0),
        then an empty line.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')
236
237
class StyleBlock(HeaderBlock):
    """
    A STYLE block: a "STYLE" line, then any number of non-empty lines that
    do not contain "-->", closed by an empty line. Passed through as-is.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
244
245
class RegionBlock(HeaderBlock):
    """
    A REGION block: "REGION", then any number of non-empty lines that do
    not contain "-->", closed by an empty line. Passed through as-is.

    The line terminator after "REGION" is matched explicitly so that the
    settings lines of a region written in the form
    "REGION<newline>settings...<newline><newline>" belong to the block.
    It is optional so that every input accepted without it (for example
    settings on the same line as "REGION") is still accepted.
    """

    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
252
253
class CommentBlock(Block):
    """
    A NOTE block: "NOTE" followed by a space, tab or line terminator, then
    any number of non-empty lines that do not contain "-->", closed by an
    empty line. Passed through as-is.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
260
261
class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry the attributes ``id`` (or None), ``start`` and ``end``
    (as returned by _parse_ts), ``settings`` (or None) and ``text``.
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Parse one cue: an optional identifier line, the timing line with
        optional settings, and the payload lines up to the next empty line
        or the end of the data.

        Returns None when the timing line does not match. Parsing happens
        on a child parser that is only committed on success, so a failed
        attempt leaves the caller's position untouched.
        """
        parser = parser.child()

        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # Collect payload lines verbatim, line terminators included.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue: identifier line (if any), timing line with settings
        (if any), the payload text and a closing line terminator.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue's attributes as a plain dict; from_json() accepts the same
        shape.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        """
        Two cues are equal when their as_json dicts are equal.

        An object without an ``as_json`` attribute (None, a string, ...) is
        not comparable: NotImplemented is returned so that Python falls
        back to its default handling instead of raising AttributeError.
        """
        other_json = getattr(other, 'as_json', None)
        if other_json is None:
            return NotImplemented
        return self.as_json == other_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict with the keys 'id', 'start', 'end', 'text'
        and 'settings'.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether ``other`` has the same text and settings as this cue
        and starts exactly where this cue ends, with neither cue ending
        before it starts.
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end
354
4a2f19ab
F
355
def parse_fragment(frag_content):
    """
    Generate (partially) parsed WebVTT blocks from a bytes object holding
    the raw contents of a WebVTT file.

    The Magic block comes first, followed by the header blocks (regions,
    styles, comments) and then by comments and cues. ParseError is raised
    when the remaining data is neither a comment nor a cue.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    yield Magic.parse(parser)

    # Header section: ends at the first thing that is not a header block.
    in_header = True
    while in_header and not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in (RegionBlock, StyleBlock, CommentBlock):
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: comments could be skipped instead
                break
        else:
            in_header = False

    # Cue section: everything left must be a comment or a cue.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in (CommentBlock, CueBlock):
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: comments could be skipped instead
                break
        else:
            raise ParseError(parser)