[yt-dlp.git] / yt_dlp / webvtt.py

"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""

import io

from .compat import re
from .utils import int_or_none, timetuple_from_msec


class _MatchParser:
    """
    A cursor over a string: remembers how far parsing has progressed and
    offers helpers to test for, and step over, the next syntax element.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test whether r matches at the current position without moving.

        A compiled pattern yields its re.Match (or None); a plain string
        yields its length when the data continues with it, otherwise None.
        Anything else raises ValueError.
        """
        if isinstance(r, re.Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the position forward by the extent of by and return by as is.

        by may be a re.Match (length of the whole match), a string (its
        length), an integer (that many characters) or None (no movement).
        Anything else raises ValueError.
        """
        if isinstance(by, re.Match):
            self._pos += len(by.group(0))
        elif isinstance(by, str):
            self._pos += len(by)
        elif isinstance(by, int):
            self._pos += by
        elif by is not None:
            raise ValueError(by)
        return by

    def consume(self, r):
        """
        Match r at the current position and step over it when it matched;
        returns whatever match() returned.
        """
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """
        Create a parser over the same data that starts at this position
        but moves independently.
        """
        return _MatchChildParser(self)


class _MatchChildParser(_MatchParser):
    """
    A speculative parser state forked from a parent parser. It reads the
    same data but tracks its own position, so a failed attempt can simply
    be dropped; a successful one is handed back to the parent via commit().
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start where the parent currently stands.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent to this child's current position and return the
        parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent


class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the
    parser's current position and up to 20 characters of the data that
    follows it.
    """

    def __init__(self, parser):
        position = parser._pos
        context = parser._data[position:position + 20]
        super().__init__("Parse error at position %u (near %r)" % (position, context))


# A WebVTT timestamp: optional hours, then minutes, seconds and an optional
# three-digit milliseconds part, captured as four groups in that order.
#
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild, so one or more digits
# are accepted here.
# See https://github.com/yt-dlp/yt-dlp/issues/921
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with an empty match) only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')


def _parse_ts(ts):
    """
    Turn a parsed WebVTT timestamp (a re.Match produced by _REGEX_TS) into
    an MPEG PES timestamp, i.e. a tick count at 90 kHz resolution.
    """
    # Milliseconds contributed by one unit of each captured field, in
    # group order: hours, minutes, seconds, milliseconds.
    multipliers = (3600_000, 60_000, 1000, 1)
    msec = 0
    for field, factor in zip(ts.groups(), multipliers):
        # Groups that did not participate in the match count as zero.
        if field:
            msec += int(field) * factor
    # 90 ticks per millisecond.
    return 90 * msec


def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision below one millisecond is lost.
    """
    # Adding 45 (half of 90) before the floor division rounds to the
    # nearest millisecond instead of truncating.
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)


class Block:
    """
    Base class of all WebVTT blocks.

    The default implementation recognises a block with the class attribute
    _REGEX (to be supplied by subclasses) and keeps the matched text
    verbatim in the attribute raw.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an instance attribute.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to parse a block of this type at the parser's position.
        Returns the new block and advances the parser on success;
        returns None and leaves the parser alone otherwise.
        """
        matched = parser.match(cls._REGEX)
        if matched:
            parser.advance(matched)
            return cls(raw=matched.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's original text to stream.
        """
        stream.write(self.raw)


class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only valid in the header
    part of the file, that is, ahead of the first cue block.
    """


class Magic(HeaderBlock):
    """
    The file signature ("WEBVTT" line) together with the optional
    X-TIMESTAMP-MAP header line that may directly follow it.

    Instances created by parse() carry three attributes:
      extra  -- the text following "WEBVTT" on the signature line
                (including its leading space or tab), or None
      local  -- the LOCAL value of X-TIMESTAMP-MAP in 90 kHz ticks, or None
      mpegts -- the MPEGTS value of X-TIMESTAMP-MAP, or None
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # does not specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the remainder of an X-TIMESTAMP-MAP line (everything after
        the "X-TIMESTAMP-MAP=" prefix, up to and including the line
        terminator) and return the tuple (local, mpegts).

        A key that does not occur on the line is reported as None.
        Raises ParseError on anything that is not a comma-separated list
        of LOCAL:<timestamp> and MPEGTS:<digits> items.
        """
        parser = parser.child()

        # Either key may be absent from the line; start from None so that
        # returning them never hits an unbound local.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m is None:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            # A separator means another item follows; a line terminator
            # ends the header line; anything else is malformed.
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that closes the header, advancing the given parser
        past all of it.

        Unlike the other block types this never returns None: a missing
        or malformed signature raises ParseError.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write a normalised header to stream: the signature line, an
        X-TIMESTAMP-MAP line when either value is non-zero, and the
        terminating blank line.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            # A value that was not given is written out as zero.
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')


class StyleBlock(HeaderBlock):
    """
    A STYLE header block. It is recognised as a whole by _REGEX and kept
    verbatim; its contents are not interpreted.
    """

    # The keyword "STYLE", optional spaces/tabs and a line terminator,
    # followed by any number of non-empty lines that do not contain "-->",
    # followed by one more line terminator (the blank line ending the block).
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class RegionBlock(HeaderBlock):
    """
    A REGION header block. It is recognised as a whole by _REGEX and kept
    verbatim; its contents are not interpreted.
    """

    # The keyword "REGION", optional spaces/tabs and a line terminator,
    # followed by any number of non-empty lines that do not contain "-->",
    # followed by one more line terminator (the blank line ending the block).
    #
    # The line terminator after the keyword is what the W3C specification
    # requires; without it in the pattern, settings placed on the lines
    # after "REGION" were left unparsed. It is optional here so that input
    # which continues on the same line as the keyword keeps matching.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CommentBlock(Block):
    """
    A NOTE (comment) block. It is recognised as a whole by _REGEX and kept
    verbatim; its contents are not interpreted.
    """

    # The keyword "NOTE" followed by a space, a tab or a line terminator,
    # then any number of non-empty lines that do not contain "-->",
    # then one more line terminator (the blank line ending the block).
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry these attributes:
      id       -- the cue identifier line, or None
      start    -- start time in 90 kHz ticks
      end      -- end time in 90 kHz ticks
      settings -- the cue settings following the timings, or None
      text     -- the payload lines, including their line terminators
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to parse a cue block at the parser's position.

        Returns the cue and advances the parser on success; returns None
        and leaves the parser where it was if the input is not a cue.
        """
        # Work on a child so that a partial match can be abandoned.
        parser = parser.child()

        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line; it ends at the
        # first blank line or at the end of the data.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue to stream, terminated by a blank line.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        # A payload that ran up to the end of its fragment has no final
        # line terminator; supply it so that the newline written below
        # really produces the blank line separating this cue from the next.
        if self.text and not self.text.endswith(('\r', '\n')):
            stream.write('\n')
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue as a plain dict, suitable for from_json().
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Comparing with something that is not a cue must not blow up with
        # an AttributeError; let Python fall back to its default handling.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """
        Recreate a cue from a dict as produced by as_json.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether other directly continues this cue: same text, same
        settings, and other starts exactly where this cue ends (with
        neither cue ending before it starts).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end


def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks found in frag_content,
    a bytes object holding the raw contents of a WebVTT file.

    The file magic comes first, then any header blocks, then comments and
    cues. Input that fits none of these raises ParseError.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    yield Magic.parse(parser)

    # Header part: region, style and comment blocks, tried in this order.
    # The first thing that is none of them ends the header.
    header_kinds = (RegionBlock, StyleBlock, CommentBlock)
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for kind in header_kinds:
            block = kind.parse(parser)
            if block:
                yield block  # XXX: or skip (for comments)
                break
        else:
            break

    # Body part: only comments and cues are acceptable from here on.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        block = CommentBlock.parse(parser)  # XXX: or skip
        if not block:
            block = CueBlock.parse(parser)
        if not block:
            raise ParseError(parser)
        yield block
Commit	Line	Data
4a2f19ab F	1	"""
	2	A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
	3	to be able to assemble a single stand-alone subtitle file, suitably adjusting
	4	timestamps on the way, while everything else is passed through unmodified.
	5
	6	Regular expressions based on the W3C WebVTT specification
	7	<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
	8	in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
	9	"""
	10
4a2f19ab	11	import io
f8271158	12
77f90330	13	from .compat import re
aa7785f8	14	from .utils import int_or_none, timetuple_from_msec
4a2f19ab F	15
4a2f19ab F	16
86e5f3ed	17	class _MatchParser:
4a2f19ab F	18	"""
	19	An object that maintains the current parsing position and allows
	20	conveniently advancing it as syntax elements are successfully parsed.
	21	"""
	22
	23	def __init__(self, string):
	24	self._data = string
	25	self._pos = 0
	26
	27	def match(self, r):
77f90330	28	if isinstance(r, re.Pattern):
4a2f19ab F	29	return r.match(self._data, self._pos)
	30	if isinstance(r, str):
	31	if self._data.startswith(r, self._pos):
	32	return len(r)
	33	return None
	34	raise ValueError(r)
	35
	36	def advance(self, by):
	37	if by is None:
	38	amt = 0
77f90330	39	elif isinstance(by, re.Match):
4a2f19ab F	40	amt = len(by.group(0))
	41	elif isinstance(by, str):
	42	amt = len(by)
	43	elif isinstance(by, int):
	44	amt = by
	45	else:
	46	raise ValueError(by)
	47	self._pos += amt
	48	return by
	49
	50	def consume(self, r):
	51	return self.advance(self.match(r))
	52
	53	def child(self):
	54	return _MatchChildParser(self)
	55
	56
	57	class _MatchChildParser(_MatchParser):
	58	"""
	59	A child parser state, which advances through the same data as
	60	its parent, but has an independent position. This is useful when
	61	advancing through syntax elements we might later want to backtrack
	62	from.
	63	"""
	64
	65	def __init__(self, parent):
86e5f3ed	66	super().__init__(parent._data)
4a2f19ab F	67	self.__parent = parent
	68	self._pos = parent._pos
	69
	70	def commit(self):
	71	"""
	72	Advance the parent state to the current position of this child state.
	73	"""
	74	self.__parent._pos = self._pos
	75	return self.__parent
	76
	77
	78	class ParseError(Exception):
	79	def __init__(self, parser):
86e5f3ed	80	super().__init__("Parse error at position %u (near %r)" % (
4a2f19ab F	81	parser._pos, parser._data[parser._pos:parser._pos + 20]
	82	))
	83
	84
81a136b8	85	# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
	86	# prescribes that hours must be 2 or more digits, timestamps with a single
	87	# digit for the hour part has been seen in the wild.
	88	# See https://github.com/yt-dlp/yt-dlp/issues/921
4a2f19ab	89	_REGEX_TS = re.compile(r'''(?x)
81a136b8	90	(?:([0-9]{1,}):)?
4a2f19ab F	91	([0-9]{2}):
	92	([0-9]{2})\.
	93	([0-9]{3})?
	94	''')
	95	_REGEX_EOF = re.compile(r'\Z')
	96	_REGEX_NL = re.compile(r'(?:\r\n\|[\r\n])')
	97	_REGEX_BLANK = re.compile(r'(?:\r\n\|[\r\n])+')
	98
	99
	100	def _parse_ts(ts):
	101	"""
	102	Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
	103	into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
	104	"""
19a03940	105	return 90 * sum(
19a03940	106	int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
4a2f19ab F	107
	108
	109	def _format_ts(ts):
	110	"""
	111	Convert an MPEG PES timestamp into a WebVTT timestamp.
	112	This will lose sub-millisecond precision.
	113	"""
aa7785f8	114	return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
4a2f19ab F	115
4a2f19ab F	116
86e5f3ed	117	class Block:
4a2f19ab F	118	"""
	119	An abstract WebVTT block.
	120	"""
	121
	122	def __init__(self, **kwargs):
	123	for key, val in kwargs.items():
	124	setattr(self, key, val)
	125
	126	@classmethod
	127	def parse(cls, parser):
	128	m = parser.match(cls._REGEX)
	129	if not m:
	130	return None
	131	parser.advance(m)
	132	return cls(raw=m.group(0))
	133
	134	def write_into(self, stream):
	135	stream.write(self.raw)
	136
	137
	138	class HeaderBlock(Block):
	139	"""
	140	A WebVTT block that may only appear in the header part of the file,
	141	i.e. before any cue blocks.
	142	"""
	143
	144	pass
	145
	146
	147	class Magic(HeaderBlock):
	148	_REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n\|[\r\n])')
	149
	150	# XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
	151	# <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
	152	# doesn’t specify the exact grammar nor where in the WebVTT
	153	# syntax it should be placed; the below has been devised based
	154	# on usage in the wild
	155	#
	156	# And strictly speaking, the presence of this extension violates
	157	# the W3C WebVTT spec. Oh well.
	158
	159	_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
	160	_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
	161	_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
81a136b8	162	_REGEX_TSMAP_SEP = re.compile(r'[ \t],[ \t]')
4a2f19ab F	163
	164	@classmethod
	165	def __parse_tsmap(cls, parser):
	166	parser = parser.child()
	167
	168	while True:
	169	m = parser.consume(cls._REGEX_TSMAP_LOCAL)
	170	if m:
	171	m = parser.consume(_REGEX_TS)
	172	if m is None:
	173	raise ParseError(parser)
	174	local = _parse_ts(m)
	175	if local is None:
	176	raise ParseError(parser)
	177	else:
	178	m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
	179	if m:
	180	mpegts = int_or_none(m.group(1))
	181	if mpegts is None:
	182	raise ParseError(parser)
	183	else:
	184	raise ParseError(parser)
81a136b8	185	if parser.consume(cls._REGEX_TSMAP_SEP):
4a2f19ab F	186	continue
	187	if parser.consume(_REGEX_NL):
	188	break
	189	raise ParseError(parser)
	190
	191	parser.commit()
	192	return local, mpegts
	193
	194	@classmethod
	195	def parse(cls, parser):
	196	parser = parser.child()
	197
	198	m = parser.consume(cls._REGEX)
	199	if not m:
	200	raise ParseError(parser)
	201
	202	extra = m.group(1)
	203	local, mpegts = None, None
	204	if parser.consume(cls._REGEX_TSMAP):
	205	local, mpegts = cls.__parse_tsmap(parser)
	206	if not parser.consume(_REGEX_NL):
	207	raise ParseError(parser)
	208	parser.commit()
	209	return cls(extra=extra, mpegts=mpegts, local=local)
	210
	211	def write_into(self, stream):
	212	stream.write('WEBVTT')
	213	if self.extra is not None:
	214	stream.write(self.extra)
	215	stream.write('\n')
	216	if self.local or self.mpegts:
	217	stream.write('X-TIMESTAMP-MAP=LOCAL:')
	218	stream.write(_format_ts(self.local if self.local is not None else 0))
	219	stream.write(',MPEGTS:')
	220	stream.write(str(self.mpegts if self.mpegts is not None else 0))
	221	stream.write('\n')
	222	stream.write('\n')
	223
	224
	225	class StyleBlock(HeaderBlock):
	226	_REGEX = re.compile(r'''(?x)
	227	STYLE[\ \t]*(?:\r\n\|[\r\n])
	228	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	229	(?:\r\n\|[\r\n])
	230	''')
	231
	232
	233	class RegionBlock(HeaderBlock):
	234	_REGEX = re.compile(r'''(?x)
	235	REGION[\ \t]*
	236	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	237	(?:\r\n\|[\r\n])
	238	''')
	239
	240
	241	class CommentBlock(Block):
	242	_REGEX = re.compile(r'''(?x)
	243	NOTE(?:\r\n\|[\ \t\r\n])
	244	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	245	(?:\r\n\|[\r\n])
	246	''')
	247
	248
	249	class CueBlock(Block):
250	"""
251	A cue block. The payload is not interpreted.
252	"""
253
254	_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n\|[\r\n])')
255	_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
256	_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
257	_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n\|[\r\n])?')
258
259	@classmethod
260	def parse(cls, parser):
261	parser = parser.child()
262
263	id = None
264	m = parser.consume(cls._REGEX_ID)
265	if m:
266	id = m.group(1)
267
268	m0 = parser.consume(_REGEX_TS)
269	if not m0:
270	return None
271	if not parser.consume(cls._REGEX_ARROW):
272	return None
273	m1 = parser.consume(_REGEX_TS)
274	if not m1:
275	return None
276	m2 = parser.consume(cls._REGEX_SETTINGS)
277	if not parser.consume(_REGEX_NL):
278	return None
279
280	start = _parse_ts(m0)
281	end = _parse_ts(m1)
282	settings = m2.group(1) if m2 is not None else None
283
284	text = io.StringIO()
285	while True:
286	m = parser.consume(cls._REGEX_PAYLOAD)
287	if not m:
288	break
289	text.write(m.group(0))
290
291	parser.commit()
292	return cls(
293	id=id,
294	start=start, end=end, settings=settings,
295	text=text.getvalue()
296	)
297
298	def write_into(self, stream):
299	if self.id is not None:
300	stream.write(self.id)
301	stream.write('\n')
302	stream.write(_format_ts(self.start))
303	stream.write(' --> ')
304	stream.write(_format_ts(self.end))
305	if self.settings is not None:
306	stream.write(' ')
307	stream.write(self.settings)
308	stream.write('\n')
309	stream.write(self.text)
310	stream.write('\n')
311
333217f4 F	312	@property
	313	def as_json(self):
	314	return {
	315	'id': self.id,
	316	'start': self.start,
	317	'end': self.end,
	318	'text': self.text,
	319	'settings': self.settings,
	320	}
	321
25a3f4f5 F	322	def __eq__(self, other):
	323	return self.as_json == other.as_json
	324
	325	@classmethod
	326	def from_json(cls, json):
	327	return cls(
	328	id=json['id'],
	329	start=json['start'],
	330	end=json['end'],
	331	text=json['text'],
	332	settings=json['settings']
	333	)
	334
	335	def hinges(self, other):
	336	if self.text != other.text:
	337	return False
	338	if self.settings != other.settings:
	339	return False
	340	return self.start <= self.end == other.start <= other.end
	341
4a2f19ab F	342
	343	def parse_fragment(frag_content):
	344	"""
	345	A generator that yields (partially) parsed WebVTT blocks when given
	346	a bytes object containing the raw contents of a WebVTT file.
	347	"""
	348
	349	parser = _MatchParser(frag_content.decode('utf-8'))
	350
	351	yield Magic.parse(parser)
	352
	353	while not parser.match(_REGEX_EOF):
	354	if parser.consume(_REGEX_BLANK):
	355	continue
	356
	357	block = RegionBlock.parse(parser)
	358	if block:
	359	yield block
	360	continue
	361	block = StyleBlock.parse(parser)
	362	if block:
	363	yield block
	364	continue
	365	block = CommentBlock.parse(parser)
	366	if block:
	367	yield block # XXX: or skip
	368	continue
	369
	370	break
	371
	372	while not parser.match(_REGEX_EOF):
	373	if parser.consume(_REGEX_BLANK):
	374	continue
	375
	376	block = CommentBlock.parse(parser)
	377	if block:
	378	yield block # XXX: or skip
	379	continue
	380	block = CueBlock.parse(parser)
	381	if block:
	382	yield block
	383	continue
	384
	385	raise ParseError(parser)