jfr.im git - yt-dlp.git/blame_incremental

... / ...

Commit	Line	Data
	1	# coding: utf-8
	2	from __future__ import unicode_literals, print_function, division
	3
	4	"""
	5	A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
	6	to be able to assemble a single stand-alone subtitle file, suitably adjusting
	7	timestamps on the way, while everything else is passed through unmodified.
	8
	9	Regular expressions based on the W3C WebVTT specification
	10	<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
	11	in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
	12	"""
	13
	14	import re
	15	import io
	16	from .utils import int_or_none
	17	from .compat import (
	18	compat_str as str,
	19	compat_Pattern,
	20	compat_Match,
	21	)
	22
	23
	24	class _MatchParser(object):
	25	"""
	26	An object that maintains the current parsing position and allows
	27	conveniently advancing it as syntax elements are successfully parsed.
	28	"""
	29
	30	def __init__(self, string):
	31	self._data = string
	32	self._pos = 0
	33
	34	def match(self, r):
	35	if isinstance(r, compat_Pattern):
	36	return r.match(self._data, self._pos)
	37	if isinstance(r, str):
	38	if self._data.startswith(r, self._pos):
	39	return len(r)
	40	return None
	41	raise ValueError(r)
	42
	43	def advance(self, by):
	44	if by is None:
	45	amt = 0
	46	elif isinstance(by, compat_Match):
	47	amt = len(by.group(0))
	48	elif isinstance(by, str):
	49	amt = len(by)
	50	elif isinstance(by, int):
	51	amt = by
	52	else:
	53	raise ValueError(by)
	54	self._pos += amt
	55	return by
	56
	57	def consume(self, r):
	58	return self.advance(self.match(r))
	59
	60	def child(self):
	61	return _MatchChildParser(self)
	62
	63
	64	class _MatchChildParser(_MatchParser):
	65	"""
	66	A child parser state, which advances through the same data as
	67	its parent, but has an independent position. This is useful when
	68	advancing through syntax elements we might later want to backtrack
	69	from.
	70	"""
	71
	72	def __init__(self, parent):
	73	super(_MatchChildParser, self).__init__(parent._data)
	74	self.__parent = parent
	75	self._pos = parent._pos
	76
	77	def commit(self):
	78	"""
	79	Advance the parent state to the current position of this child state.
	80	"""
	81	self.__parent._pos = self._pos
	82	return self.__parent
	83
	84
	85	class ParseError(Exception):
	86	def __init__(self, parser):
	87	super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
	88	parser._pos, parser._data[parser._pos:parser._pos + 20]
	89	))
	90
	91
	92	_REGEX_TS = re.compile(r'''(?x)
	93	(?:([0-9]{2,}):)?
	94	([0-9]{2}):
	95	([0-9]{2})\.
	96	([0-9]{3})?
	97	''')
	98	_REGEX_EOF = re.compile(r'\Z')
	99	_REGEX_NL = re.compile(r'(?:\r\n\|[\r\n])')
	100	_REGEX_BLANK = re.compile(r'(?:\r\n\|[\r\n])+')
	101
	102
	103	def _parse_ts(ts):
	104	"""
	105	Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
	106	into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
	107	"""
	108
	109	h, min, s, ms = ts.groups()
	110	return 90 * (
	111	int(h or 0) * 3600000 + # noqa: W504,E221,E222
	112	int(min) * 60000 + # noqa: W504,E221,E222
	113	int(s) * 1000 + # noqa: W504,E221,E222
	114	int(ms) # noqa: W504,E221,E222
	115	)
	116
	117
	118	def _format_ts(ts):
	119	"""
	120	Convert an MPEG PES timestamp into a WebVTT timestamp.
	121	This will lose sub-millisecond precision.
	122	"""
	123
	124	ts = int((ts + 45) // 90)
	125	ms , ts = divmod(ts, 1000) # noqa: W504,E221,E222,E203
	126	s , ts = divmod(ts, 60) # noqa: W504,E221,E222,E203
	127	min, h = divmod(ts, 60) # noqa: W504,E221,E222
	128	return '%02u:%02u:%02u.%03u' % (h, min, s, ms)
	129
	130
	131	class Block(object):
	132	"""
	133	An abstract WebVTT block.
	134	"""
	135
	136	def __init__(self, **kwargs):
	137	for key, val in kwargs.items():
	138	setattr(self, key, val)
	139
	140	@classmethod
	141	def parse(cls, parser):
	142	m = parser.match(cls._REGEX)
	143	if not m:
	144	return None
	145	parser.advance(m)
	146	return cls(raw=m.group(0))
	147
	148	def write_into(self, stream):
	149	stream.write(self.raw)
	150
	151
	152	class HeaderBlock(Block):
	153	"""
	154	A WebVTT block that may only appear in the header part of the file,
	155	i.e. before any cue blocks.
	156	"""
	157
	158	pass
	159
	160
	161	class Magic(HeaderBlock):
	162	_REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n\|[\r\n])')
	163
	164	# XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
	165	# <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
	166	# doesn’t specify the exact grammar nor where in the WebVTT
	167	# syntax it should be placed; the below has been devised based
	168	# on usage in the wild
	169	#
	170	# And strictly speaking, the presence of this extension violates
	171	# the W3C WebVTT spec. Oh well.
	172
	173	_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
	174	_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
	175	_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
	176
	177	@classmethod
	178	def __parse_tsmap(cls, parser):
	179	parser = parser.child()
	180
	181	while True:
	182	m = parser.consume(cls._REGEX_TSMAP_LOCAL)
	183	if m:
	184	m = parser.consume(_REGEX_TS)
	185	if m is None:
	186	raise ParseError(parser)
	187	local = _parse_ts(m)
	188	if local is None:
	189	raise ParseError(parser)
	190	else:
	191	m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
	192	if m:
	193	mpegts = int_or_none(m.group(1))
	194	if mpegts is None:
	195	raise ParseError(parser)
	196	else:
	197	raise ParseError(parser)
	198	if parser.consume(','):
	199	continue
	200	if parser.consume(_REGEX_NL):
	201	break
	202	raise ParseError(parser)
	203
	204	parser.commit()
	205	return local, mpegts
	206
	207	@classmethod
	208	def parse(cls, parser):
	209	parser = parser.child()
	210
	211	m = parser.consume(cls._REGEX)
	212	if not m:
	213	raise ParseError(parser)
	214
	215	extra = m.group(1)
	216	local, mpegts = None, None
	217	if parser.consume(cls._REGEX_TSMAP):
	218	local, mpegts = cls.__parse_tsmap(parser)
	219	if not parser.consume(_REGEX_NL):
	220	raise ParseError(parser)
	221	parser.commit()
	222	return cls(extra=extra, mpegts=mpegts, local=local)
	223
	224	def write_into(self, stream):
	225	stream.write('WEBVTT')
	226	if self.extra is not None:
	227	stream.write(self.extra)
	228	stream.write('\n')
	229	if self.local or self.mpegts:
	230	stream.write('X-TIMESTAMP-MAP=LOCAL:')
	231	stream.write(_format_ts(self.local if self.local is not None else 0))
	232	stream.write(',MPEGTS:')
	233	stream.write(str(self.mpegts if self.mpegts is not None else 0))
	234	stream.write('\n')
	235	stream.write('\n')
	236
	237
	238	class StyleBlock(HeaderBlock):
	239	_REGEX = re.compile(r'''(?x)
	240	STYLE[\ \t]*(?:\r\n\|[\r\n])
	241	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	242	(?:\r\n\|[\r\n])
	243	''')
	244
	245
	246	class RegionBlock(HeaderBlock):
	247	_REGEX = re.compile(r'''(?x)
	248	REGION[\ \t]*
	249	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	250	(?:\r\n\|[\r\n])
	251	''')
	252
	253
	254	class CommentBlock(Block):
	255	_REGEX = re.compile(r'''(?x)
	256	NOTE(?:\r\n\|[\ \t\r\n])
	257	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	258	(?:\r\n\|[\r\n])
	259	''')
	260
	261
	262	class CueBlock(Block):
	263	"""
	264	A cue block. The payload is not interpreted.
	265	"""
	266
	267	_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n\|[\r\n])')
	268	_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
	269	_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
	270	_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n\|[\r\n])?')
	271
	272	@classmethod
	273	def parse(cls, parser):
	274	parser = parser.child()
	275
	276	id = None
	277	m = parser.consume(cls._REGEX_ID)
	278	if m:
	279	id = m.group(1)
	280
	281	m0 = parser.consume(_REGEX_TS)
	282	if not m0:
	283	return None
	284	if not parser.consume(cls._REGEX_ARROW):
	285	return None
	286	m1 = parser.consume(_REGEX_TS)
	287	if not m1:
	288	return None
	289	m2 = parser.consume(cls._REGEX_SETTINGS)
	290	if not parser.consume(_REGEX_NL):
	291	return None
	292
	293	start = _parse_ts(m0)
	294	end = _parse_ts(m1)
	295	settings = m2.group(1) if m2 is not None else None
	296
	297	text = io.StringIO()
	298	while True:
	299	m = parser.consume(cls._REGEX_PAYLOAD)
	300	if not m:
	301	break
	302	text.write(m.group(0))
	303
	304	parser.commit()
	305	return cls(
	306	id=id,
	307	start=start, end=end, settings=settings,
	308	text=text.getvalue()
	309	)
	310
	311	def write_into(self, stream):
	312	if self.id is not None:
	313	stream.write(self.id)
	314	stream.write('\n')
	315	stream.write(_format_ts(self.start))
	316	stream.write(' --> ')
	317	stream.write(_format_ts(self.end))
	318	if self.settings is not None:
	319	stream.write(' ')
	320	stream.write(self.settings)
	321	stream.write('\n')
	322	stream.write(self.text)
	323	stream.write('\n')
	324
	325	@property
	326	def as_json(self):
	327	return {
	328	'id': self.id,
	329	'start': self.start,
	330	'end': self.end,
	331	'text': self.text,
	332	'settings': self.settings,
	333	}
	334
	335
	336	def parse_fragment(frag_content):
	337	"""
	338	A generator that yields (partially) parsed WebVTT blocks when given
	339	a bytes object containing the raw contents of a WebVTT file.
	340	"""
	341
	342	parser = _MatchParser(frag_content.decode('utf-8'))
	343
	344	yield Magic.parse(parser)
	345
	346	while not parser.match(_REGEX_EOF):
	347	if parser.consume(_REGEX_BLANK):
	348	continue
	349
	350	block = RegionBlock.parse(parser)
	351	if block:
	352	yield block
	353	continue
	354	block = StyleBlock.parse(parser)
	355	if block:
	356	yield block
	357	continue
	358	block = CommentBlock.parse(parser)
	359	if block:
	360	yield block # XXX: or skip
	361	continue
	362
	363	break
	364
	365	while not parser.match(_REGEX_EOF):
	366	if parser.consume(_REGEX_BLANK):
	367	continue
	368
	369	block = CommentBlock.parse(parser)
	370	if block:
	371	yield block # XXX: or skip
	372	continue
	373	block = CueBlock.parse(parser)
	374	if block:
	375	yield block
	376	continue
	377
	378	raise ParseError(parser)