[yt-dlp.git] / yt_dlp / webvtt.py

# coding: utf-8
from __future__ import unicode_literals, print_function, division

"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""

import re
import io
from .utils import int_or_none, timetuple_from_msec
from .compat import (
    compat_str as str,
    compat_Pattern,
    compat_Match,
)


class _MatchParser(object):
    """
    A cursor over a string: remembers how far parsing has progressed and
    moves forward whenever a syntax element has been recognised.
    """

    def __init__(self, string):
        self._pos = 0
        self._data = string

    def match(self, r):
        """
        Test `r` against the data at the current position without moving.

        A compiled pattern yields its match object (or None); a plain string
        yields its length when the data continues with it, otherwise None.
        Anything else is rejected with ValueError.
        """
        if isinstance(r, compat_Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the position forward by whatever `by` describes and return `by`.

        `by` may be None (no movement), a match object or string (its length)
        or a plain integer; other types raise ValueError.
        """
        if by is not None:
            if isinstance(by, compat_Match):
                step = len(by.group(0))
            elif isinstance(by, str):
                step = len(by)
            elif isinstance(by, int):
                step = by
            else:
                raise ValueError(by)
            self._pos += step
        return by

    def consume(self, r):
        """
        Match `r` and, if it matched, step over it. Returns what match() gave.
        """
        outcome = self.match(r)
        return self.advance(outcome)

    def child(self):
        """
        Create a parser over the same data with its own, independent position.
        """
        return _MatchChildParser(self)


class _MatchChildParser(_MatchParser):
    """
    A speculative parser state. It reads the same data as the parser it was
    created from but keeps a position of its own, so a parse attempt can be
    abandoned without the parent ever noticing.
    """

    def __init__(self, parent):
        super(_MatchChildParser, self).__init__(parent._data)
        # Start where the parent currently stands.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Copy this child's position into the parent and return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent


class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the parser's
    current position and up to 20 characters of the data found there.
    """

    def __init__(self, parser):
        where = parser._pos
        nearby = parser._data[where:where + 20]
        message = "Parse error at position %u (near %r)" % (where, nearby)
        super(ParseError, self).__init__(message)


# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Capture groups: (1) hours, optional together with its colon; (2) minutes;
# (3) seconds; (4) milliseconds. The dot after the seconds is required but
# the millisecond digits are optional, so group 4 may be None.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, a lone CR or a lone LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')


def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.

    The match is expected to expose four groups: hours, minutes, seconds
    and milliseconds. Hours and milliseconds may be None (both are optional
    in the pattern) and then count as zero.
    """

    hours, minutes, seconds, millis = ts.groups()

    # Accumulate everything in milliseconds first
    msec = int(hours or 0)
    msec = msec * 60 + int(minutes)
    msec = msec * 60 + int(seconds)
    # An absent millisecond part would make int() raise TypeError
    msec = msec * 1000 + int(millis or 0)

    # 90 ticks per millisecond
    return 90 * msec


def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Ticks are rounded to the nearest millisecond, so any finer precision
    is lost.
    """
    # Adding half a millisecond (45 ticks) before the floor division rounds
    # to nearest instead of truncating.
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)


class Block(object):
    """
    Base class of all WebVTT blocks.

    Keyword arguments given to the constructor become instance attributes.
    Subclasses relying on the default parse() must provide a compiled
    pattern in `_REGEX`.
    """

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is moved past the block and an instance
        holding the matched text in `raw` is returned; otherwise the parser
        is left alone and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's original text to `stream`.
        """
        stream.write(self.raw)


class HeaderBlock(Block):
    """
    Marker base class for blocks that belong to the header part of a WebVTT
    file, that is, blocks that may only occur before the first cue block.
    """


class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line that opens a file, together with the
    optional X-TIMESTAMP-MAP header that may directly follow it.

    Instances created by parse() carry:
    extra  -- what follows "WEBVTT" on the signature line (including the
              separating space or tab), or None
    local  -- the LOCAL component of the timestamp map in 90 kHz ticks,
              or None if absent
    mpegts -- the MPEGTS component of the timestamp map as an integer,
              or None if absent
    """

    # Optional BOM, the signature, optional trailing text, one line terminator
    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header, i.e. everything after
        the "=": a comma-separated list of LOCAL:<timestamp> and
        MPEGTS:<digits> items ended by a line terminator.

        Returns a (local, mpegts) tuple; a component that does not occur in
        the list is returned as None. Raises ParseError on malformed input.
        """
        parser = parser.child()

        # Either component may be missing from the list, so both need a
        # value before the loop; otherwise the return statement below would
        # fail with UnboundLocalError for a one-item map.
        local, mpegts = None, None

        while True:
            m = parser.consume(cls._REGEX_TSMAP_LOCAL)
            if m:
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
                if local is None:
                    raise ParseError(parser)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m:
                    mpegts = int_or_none(m.group(1))
                    if mpegts is None:
                        raise ParseError(parser)
                else:
                    # Neither LOCAL: nor MPEGTS: -- unknown item
                    raise ParseError(parser)
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        line terminator that must follow them.

        Unlike the other block types this never returns None: input that
        does not start with a valid header raises ParseError. The given
        parser is only advanced when the whole header has been read.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the header to `stream` using LF line endings, closed by an
        empty line. The timestamp map is emitted only when at least one of
        its components is non-zero; a missing component is written as zero.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')


class StyleBlock(HeaderBlock):
    """
    A STYLE block. Its contents are not interpreted.
    """

    # The keyword (optionally followed by spaces or tabs) on a line of its
    # own, then any number of non-empty lines that do not contain "-->",
    # then one further line terminator that closes the block.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class RegionBlock(HeaderBlock):
    """
    A REGION block. Its contents are not interpreted.
    """

    # The keyword (optionally followed by spaces or tabs), then any number
    # of non-empty lines that do not contain "-->", then one further line
    # terminator that closes the block.
    #
    # The W3C grammar puts a line terminator between the keyword and the
    # settings. It is accepted here but kept optional, so that settings
    # written on the keyword's own line keep matching as well. Without it,
    # "REGION\n<settings>\n\n" would match as just "REGION\n" and leave the
    # settings behind as unparseable input.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*
        (?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CommentBlock(Block):
    """
    A NOTE (comment) block. Its contents are not interpreted.
    """

    # The keyword followed by a space, a tab or a line terminator, then any
    # number of non-empty lines that do not contain "-->" (text on the
    # keyword's own line counts as the first of them), then one further
    # line terminator beyond that of the last text line.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry:
    id       -- the cue identifier line, or None
    start    -- start time in 90 kHz ticks
    end      -- end time in 90 kHz ticks
    settings -- the cue settings that follow the timings, or None
    text     -- the payload lines exactly as found in the input
    """

    # A line that does not contain "-->"
    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    # One non-empty line; the terminator is optional so that the last line
    # of the input is accepted, too
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read a cue block at the parser's position.

        Returns the parsed cue and advances the parser, or returns None and
        leaves the parser untouched if no well-formed cue starts here.
        """
        parser = parser.child()

        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload extends up to the first empty line or the end of input
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`, always followed by an empty line so that
        whatever is written next starts a new block.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        # A payload taken from the very end of a fragment may lack its line
        # terminator; supply it, or the separator below would merely end the
        # last payload line and the next block would run into this cue.
        if self.text and not self.text.endswith(('\n', '\r')):
            stream.write('\n')
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue's fields as a plain dictionary.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign objects
        # rather than failing on their missing as_json attribute.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dictionary shaped like the one as_json returns.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether `other` continues this cue: same text and settings,
        and `other` starts exactly when this cue ends (neither cue ending
        before it starts).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end


def parse_fragment(frag_content):
    """
    Generate the (partially) parsed blocks of one WebVTT file.

    `frag_content` is a bytes object with the raw, UTF-8 encoded file. The
    first block yielded is always the Magic header; header blocks and
    comments follow, then cues and comments. Input that fits none of the
    expected blocks raises ParseError.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    def first_parsed(block_types):
        # Try the given block types in order; the first success wins.
        for block_type in block_types:
            parsed = block_type.parse(parser)
            if parsed:
                return parsed
        return None

    yield Magic.parse(parser)

    # Header part: region, style and comment blocks (comments could also
    # be skipped). The first thing that is none of these ends the header.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_parsed((RegionBlock, StyleBlock, CommentBlock))
        if not block:
            break
        yield block

    # Body part: comment and cue blocks only; anything else is an error.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        block = first_parsed((CommentBlock, CueBlock))
        if not block:
            raise ParseError(parser)
        yield block
Commit	Line	Data
4a2f19ab F	1	# coding: utf-8
	2	from __future__ import unicode_literals, print_function, division
	3
	4	"""
	5	A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
	6	to be able to assemble a single stand-alone subtitle file, suitably adjusting
	7	timestamps on the way, while everything else is passed through unmodified.
	8
	9	Regular expressions based on the W3C WebVTT specification
	10	<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
	11	in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
	12	"""
	13
	14	import re
	15	import io
aa7785f8	16	from .utils import int_or_none, timetuple_from_msec
4a2f19ab F	17	from .compat import (
	18	compat_str as str,
	19	compat_Pattern,
	20	compat_Match,
	21	)
	22
	23
	24	class _MatchParser(object):
	25	"""
	26	An object that maintains the current parsing position and allows
	27	conveniently advancing it as syntax elements are successfully parsed.
	28	"""
	29
	30	def __init__(self, string):
	31	self._data = string
	32	self._pos = 0
	33
	34	def match(self, r):
	35	if isinstance(r, compat_Pattern):
	36	return r.match(self._data, self._pos)
	37	if isinstance(r, str):
	38	if self._data.startswith(r, self._pos):
	39	return len(r)
	40	return None
	41	raise ValueError(r)
	42
	43	def advance(self, by):
	44	if by is None:
	45	amt = 0
	46	elif isinstance(by, compat_Match):
	47	amt = len(by.group(0))
	48	elif isinstance(by, str):
	49	amt = len(by)
	50	elif isinstance(by, int):
	51	amt = by
	52	else:
	53	raise ValueError(by)
	54	self._pos += amt
	55	return by
	56
	57	def consume(self, r):
	58	return self.advance(self.match(r))
	59
	60	def child(self):
	61	return _MatchChildParser(self)
	62
	63
	64	class _MatchChildParser(_MatchParser):
	65	"""
	66	A child parser state, which advances through the same data as
	67	its parent, but has an independent position. This is useful when
	68	advancing through syntax elements we might later want to backtrack
	69	from.
	70	"""
	71
	72	def __init__(self, parent):
	73	super(_MatchChildParser, self).__init__(parent._data)
	74	self.__parent = parent
	75	self._pos = parent._pos
	76
	77	def commit(self):
	78	"""
	79	Advance the parent state to the current position of this child state.
	80	"""
81	self.__parent._pos = self._pos
82	return self.__parent
83
84
85	class ParseError(Exception):
86	def __init__(self, parser):
87	super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
88	parser._pos, parser._data[parser._pos:parser._pos + 20]
89	))
90
91
81a136b8	92	# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
	93	# prescribes that hours must be 2 or more digits, timestamps with a single
	94	# digit for the hour part has been seen in the wild.
	95	# See https://github.com/yt-dlp/yt-dlp/issues/921
4a2f19ab	96	_REGEX_TS = re.compile(r'''(?x)
81a136b8	97	(?:([0-9]{1,}):)?
4a2f19ab F	98	([0-9]{2}):
	99	([0-9]{2})\.
	100	([0-9]{3})?
	101	''')
	102	_REGEX_EOF = re.compile(r'\Z')
	103	_REGEX_NL = re.compile(r'(?:\r\n\|[\r\n])')
	104	_REGEX_BLANK = re.compile(r'(?:\r\n\|[\r\n])+')
	105
	106
	107	def _parse_ts(ts):
	108	"""
	109	Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
	110	into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
	111	"""
	112
	113	h, min, s, ms = ts.groups()
	114	return 90 * (
	115	int(h or 0) * 3600000 + # noqa: W504,E221,E222
	116	int(min) * 60000 + # noqa: W504,E221,E222
	117	int(s) * 1000 + # noqa: W504,E221,E222
	118	int(ms) # noqa: W504,E221,E222
	119	)
	120
	121
	122	def _format_ts(ts):
	123	"""
	124	Convert an MPEG PES timestamp into a WebVTT timestamp.
	125	This will lose sub-millisecond precision.
	126	"""
aa7785f8	127	return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
4a2f19ab F	128
	129
	130	class Block(object):
	131	"""
	132	An abstract WebVTT block.
	133	"""
	134
	135	def __init__(self, **kwargs):
	136	for key, val in kwargs.items():
	137	setattr(self, key, val)
	138
	139	@classmethod
	140	def parse(cls, parser):
	141	m = parser.match(cls._REGEX)
	142	if not m:
	143	return None
	144	parser.advance(m)
	145	return cls(raw=m.group(0))
	146
	147	def write_into(self, stream):
	148	stream.write(self.raw)
	149
	150
	151	class HeaderBlock(Block):
	152	"""
	153	A WebVTT block that may only appear in the header part of the file,
	154	i.e. before any cue blocks.
	155	"""
	156
	157	pass
	158
	159
	160	class Magic(HeaderBlock):
	161	_REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n\|[\r\n])')
	162
	163	# XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
	164	# <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
	165	# doesn’t specify the exact grammar nor where in the WebVTT
	166	# syntax it should be placed; the below has been devised based
	167	# on usage in the wild
	168	#
	169	# And strictly speaking, the presence of this extension violates
	170	# the W3C WebVTT spec. Oh well.
	171
	172	_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
	173	_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
	174	_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
81a136b8	175	_REGEX_TSMAP_SEP = re.compile(r'[ \t],[ \t]')
4a2f19ab F	176
	177	@classmethod
	178	def __parse_tsmap(cls, parser):
	179	parser = parser.child()
	180
	181	while True:
	182	m = parser.consume(cls._REGEX_TSMAP_LOCAL)
	183	if m:
	184	m = parser.consume(_REGEX_TS)
	185	if m is None:
	186	raise ParseError(parser)
	187	local = _parse_ts(m)
	188	if local is None:
	189	raise ParseError(parser)
	190	else:
	191	m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
	192	if m:
	193	mpegts = int_or_none(m.group(1))
	194	if mpegts is None:
	195	raise ParseError(parser)
	196	else:
	197	raise ParseError(parser)
81a136b8	198	if parser.consume(cls._REGEX_TSMAP_SEP):
4a2f19ab F	199	continue
	200	if parser.consume(_REGEX_NL):
	201	break
	202	raise ParseError(parser)
	203
	204	parser.commit()
	205	return local, mpegts
	206
	207	@classmethod
	208	def parse(cls, parser):
	209	parser = parser.child()
	210
	211	m = parser.consume(cls._REGEX)
	212	if not m:
	213	raise ParseError(parser)
	214
	215	extra = m.group(1)
	216	local, mpegts = None, None
	217	if parser.consume(cls._REGEX_TSMAP):
	218	local, mpegts = cls.__parse_tsmap(parser)
	219	if not parser.consume(_REGEX_NL):
	220	raise ParseError(parser)
	221	parser.commit()
	222	return cls(extra=extra, mpegts=mpegts, local=local)
	223
	224	def write_into(self, stream):
	225	stream.write('WEBVTT')
	226	if self.extra is not None:
	227	stream.write(self.extra)
	228	stream.write('\n')
	229	if self.local or self.mpegts:
	230	stream.write('X-TIMESTAMP-MAP=LOCAL:')
	231	stream.write(_format_ts(self.local if self.local is not None else 0))
	232	stream.write(',MPEGTS:')
	233	stream.write(str(self.mpegts if self.mpegts is not None else 0))
	234	stream.write('\n')
	235	stream.write('\n')
	236
	237
	238	class StyleBlock(HeaderBlock):
	239	_REGEX = re.compile(r'''(?x)
	240	STYLE[\ \t]*(?:\r\n\|[\r\n])
	241	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	242	(?:\r\n\|[\r\n])
	243	''')
	244
	245
	246	class RegionBlock(HeaderBlock):
	247	_REGEX = re.compile(r'''(?x)
	248	REGION[\ \t]*
	249	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	250	(?:\r\n\|[\r\n])
	251	''')
	252
	253
	254	class CommentBlock(Block):
	255	_REGEX = re.compile(r'''(?x)
	256	NOTE(?:\r\n\|[\ \t\r\n])
	257	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	258	(?:\r\n\|[\r\n])
	259	''')
	260
	261
	262	class CueBlock(Block):
263	"""
264	A cue block. The payload is not interpreted.
265	"""
266
267	_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n\|[\r\n])')
268	_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
269	_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
270	_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n\|[\r\n])?')
271
272	@classmethod
273	def parse(cls, parser):
274	parser = parser.child()
275
276	id = None
277	m = parser.consume(cls._REGEX_ID)
278	if m:
279	id = m.group(1)
280
281	m0 = parser.consume(_REGEX_TS)
282	if not m0:
283	return None
284	if not parser.consume(cls._REGEX_ARROW):
285	return None
286	m1 = parser.consume(_REGEX_TS)
287	if not m1:
288	return None
289	m2 = parser.consume(cls._REGEX_SETTINGS)
290	if not parser.consume(_REGEX_NL):
291	return None
292
293	start = _parse_ts(m0)
294	end = _parse_ts(m1)
295	settings = m2.group(1) if m2 is not None else None
296
297	text = io.StringIO()
298	while True:
299	m = parser.consume(cls._REGEX_PAYLOAD)
300	if not m:
301	break
302	text.write(m.group(0))
303
304	parser.commit()
305	return cls(
306	id=id,
307	start=start, end=end, settings=settings,
308	text=text.getvalue()
309	)
310
311	def write_into(self, stream):
312	if self.id is not None:
313	stream.write(self.id)
314	stream.write('\n')
315	stream.write(_format_ts(self.start))
316	stream.write(' --> ')
317	stream.write(_format_ts(self.end))
318	if self.settings is not None:
319	stream.write(' ')
320	stream.write(self.settings)
321	stream.write('\n')
322	stream.write(self.text)
323	stream.write('\n')
324
333217f4 F	325	@property
	326	def as_json(self):
	327	return {
	328	'id': self.id,
	329	'start': self.start,
	330	'end': self.end,
	331	'text': self.text,
	332	'settings': self.settings,
	333	}
	334
25a3f4f5 F	335	def __eq__(self, other):
	336	return self.as_json == other.as_json
	337
	338	@classmethod
	339	def from_json(cls, json):
	340	return cls(
	341	id=json['id'],
	342	start=json['start'],
	343	end=json['end'],
	344	text=json['text'],
	345	settings=json['settings']
	346	)
	347
	348	def hinges(self, other):
	349	if self.text != other.text:
	350	return False
	351	if self.settings != other.settings:
	352	return False
	353	return self.start <= self.end == other.start <= other.end
	354
4a2f19ab F	355
	356	def parse_fragment(frag_content):
	357	"""
	358	A generator that yields (partially) parsed WebVTT blocks when given
	359	a bytes object containing the raw contents of a WebVTT file.
	360	"""
	361
	362	parser = _MatchParser(frag_content.decode('utf-8'))
	363
	364	yield Magic.parse(parser)
	365
	366	while not parser.match(_REGEX_EOF):
	367	if parser.consume(_REGEX_BLANK):
	368	continue
	369
	370	block = RegionBlock.parse(parser)
	371	if block:
	372	yield block
	373	continue
	374	block = StyleBlock.parse(parser)
	375	if block:
	376	yield block
	377	continue
	378	block = CommentBlock.parse(parser)
	379	if block:
	380	yield block # XXX: or skip
	381	continue
	382
	383	break
	384
	385	while not parser.match(_REGEX_EOF):
	386	if parser.consume(_REGEX_BLANK):
	387	continue
	388
	389	block = CommentBlock.parse(parser)
	390	if block:
	391	yield block # XXX: or skip
	392	continue
	393	block = CueBlock.parse(parser)
	394	if block:
	395	yield block
	396	continue
	397
	398	raise ParseError(parser)