[yt-dlp.git] / yt_dlp / webvtt.py

# coding: utf-8
from __future__ import unicode_literals, print_function, division

"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""

import re
import io
from .utils import int_or_none
from .compat import (
    compat_str as str,
    compat_Pattern,
    compat_Match,
)


class _MatchParser(object):
    """
    An object that maintains the current parsing position and allows
    conveniently advancing it as syntax elements are successfully parsed.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        if isinstance(r, compat_Pattern):
            return r.match(self._data, self._pos)
        if isinstance(r, str):
            if self._data.startswith(r, self._pos):
                return len(r)
            return None
        raise ValueError(r)

    def advance(self, by):
        if by is None:
            amt = 0
        elif isinstance(by, compat_Match):
            amt = len(by.group(0))
        elif isinstance(by, str):
            amt = len(by)
        elif isinstance(by, int):
            amt = by
        else:
            raise ValueError(by)
        self._pos += amt
        return by

    def consume(self, r):
        return self.advance(self.match(r))

    def child(self):
        return _MatchChildParser(self)


class _MatchChildParser(_MatchParser):
    """
    A parser state forked from a parent state. It reads the same data but
    keeps its own position, so a caller can parse speculatively and simply
    drop the child to backtrack, or commit() to keep the progress.
    """

    def __init__(self, parent):
        self.__parent = parent
        super(_MatchChildParser, self).__init__(parent._data)
        # Start where the parent currently stands rather than at zero.
        self._pos = parent._pos

    def commit(self):
        """
        Copy this child's position into the parent and return the parent.
        """
        owner = self.__parent
        owner._pos = self._pos
        return owner


class ParseError(Exception):
    """
    Raised when the data at the parser's current position cannot be parsed.
    The message carries the position and up to 20 characters of context.
    """

    def __init__(self, parser):
        where = parser._pos
        context = parser._data[where:where + 20]
        message = "Parse error at position %u (near %r)" % (where, context)
        super(ParseError, self).__init__(message)


# A WebVTT timestamp: optional hours, then minutes, seconds and milliseconds,
# captured as four groups in that order.
#
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild, so one or more digits
# are accepted here.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# NOTE: the milliseconds group is optional in this pattern, so a match may
# carry None for it (as it may for the hours group).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with zero width) only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators, i.e. blank space between blocks.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')


def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
    """

    h, min, s, ms = ts.groups()
    return 90 * (
        int(h or 0) * 3600000 +  # noqa: W504,E221,E222
        int(min)    *   60000 +  # noqa: W504,E221,E222
        int(s)      *    1000 +  # noqa: W504,E221,E222
        int(ms)                  # noqa: W504,E221,E222
    )


def _format_ts(ts):
    """
    Convert an MPEG PES timestamp into a WebVTT timestamp.
    This will lose sub-millisecond precision.
    """
    msec = int((ts + 45) // 90)
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msec)


class Block(object):
    """
    Base class for WebVTT blocks. Subclasses relying on the default parse()
    must provide a compiled pattern as the class attribute _REGEX.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an instance attribute.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to parse one block at the parser's position. On success the
        parser is advanced and a block holding the matched text as `raw`
        is returned; otherwise None is returned and the parser is untouched.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's raw text to *stream* unchanged.
        """
        stream.write(self.raw)


class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only allowed in the header
    part of the file, that is, ahead of the first cue block.
    """


class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line together with an optional X-TIMESTAMP-MAP
    header line.

    Instances created by parse() carry three attributes:
      extra   the text following "WEBVTT" on the signature line, or None
      local   the LOCAL component of the timestamp map in 90 kHz ticks,
              or None when absent
      mpegts  the MPEGTS component of the timestamp map, or None when absent
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header, up to and including
        the line terminator, and return a (local, mpegts) tuple.

        The LOCAL and MPEGTS components may appear in any order, separated
        by commas. A component that does not appear is returned as None.
        Raises ParseError on malformed input.
        """
        parser = parser.child()

        # Either component may be missing from the map; start from None so
        # that a partial map does not end in an UnboundLocalError on return.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, an optional X-TIMESTAMP-MAP line and the
        blank line that follows them. The parser is advanced only on
        success; any deviation raises ParseError.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        # The header must be terminated by an empty line.
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the signature line, the timestamp map (only when either of
        its components is non-zero) and the terminating blank line.
        A component that is None is written as zero.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')


class StyleBlock(HeaderBlock):
    """
    A STYLE block, kept verbatim (parsed and written via the inherited
    Block.parse / Block.write_into using the pattern below).
    """

    # "STYLE" on a line of its own (trailing spaces/tabs allowed), followed
    # by any number of non-empty lines that do not contain "-->", and closed
    # by an empty line.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class RegionBlock(HeaderBlock):
    """
    A REGION definition block, kept verbatim (parsed and written via the
    inherited Block.parse / Block.write_into using the pattern below).
    """

    # "REGION" (trailing spaces/tabs allowed), followed by any number of
    # non-empty settings lines that do not contain "-->", and closed by an
    # empty line.
    #
    # The W3C grammar puts a line terminator between the "REGION" header
    # line and the settings lines. Without consuming it here, the settings
    # lines could never match (the line group cannot start at a newline),
    # leaving them behind to be misparsed as a cue. The terminator is kept
    # optional so that everything the previous pattern matched still does.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CommentBlock(Block):
    """
    A NOTE (comment) block, kept verbatim (parsed and written via the
    inherited Block.parse / Block.write_into using the pattern below).
    """

    # "NOTE" followed by a space, tab or line terminator, then any number
    # of non-empty lines that do not contain "-->", and closed by an empty
    # line.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances created by parse() or from_json() carry these attributes:
      id        the cue identifier line, or None when absent
      start     start time in 90 kHz ticks
      end       end time in 90 kHz ticks
      settings  the cue settings text following the timings, or None
      text      the payload lines, including their line terminators
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to parse a cue block at the parser's position.

        Returns the parsed cue and advances the parser on success. Returns
        None, leaving the parser untouched, when the data does not form a
        cue (all parsing happens on a child state that is only committed
        at the end).
        """
        parser = parser.child()

        # Optional identifier line; it can never be the timings line
        # because the pattern refuses any line containing "-->".
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line, up to the first
        # empty line or the end of the data.
        payload = []
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            payload.append(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=''.join(payload)
        )

    def write_into(self, stream):
        """
        Write the cue to *stream*: identifier line (if any), timings with
        optional settings, the payload and a terminating newline.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue's fields as a plain dict, suitable for from_json().
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        """
        Two cues are equal when all their fields are equal. Comparing with
        anything that is not a CueBlock yields NotImplemented (so `==`
        evaluates to False) rather than failing with AttributeError.
        """
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict in the shape produced by as_json.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether *other* directly continues this cue: same text and
        settings, and this cue ends exactly where *other* starts (with
        neither cue having its end before its start).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end


def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks found in *frag_content*,
    a bytes object holding the raw, UTF-8 encoded contents of a WebVTT file.

    The Magic block comes first, then any header blocks (regions, styles,
    comments), then comments and cues. ParseError is raised when the data
    cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode('utf-8'))

    yield Magic.parse(parser)

    # Header part: runs until something that is not a header block shows up.
    header_types = (RegionBlock, StyleBlock, CommentBlock)
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in header_types:
            block = block_type.parse(parser)
            if block:
                break
        else:
            # None of the header block types matched: the header is over.
            break
        yield block  # XXX: or skip, when it is a comment

    # Cue part: everything up to the end must be a comment or a cue.
    body_types = (CommentBlock, CueBlock)
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        for block_type in body_types:
            block = block_type.parse(parser)
            if block:
                break
        else:
            raise ParseError(parser)
        yield block  # XXX: or skip, when it is a comment
Commit	Line	Data
4a2f19ab F	1	# coding: utf-8
	2	from __future__ import unicode_literals, print_function, division
	3
	4	"""
	5	A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
	6	to be able to assemble a single stand-alone subtitle file, suitably adjusting
	7	timestamps on the way, while everything else is passed through unmodified.
	8
	9	Regular expressions based on the W3C WebVTT specification
	10	<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
	11	in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
	12	"""
	13
	14	import re
	15	import io
	16	from .utils import int_or_none
	17	from .compat import (
	18	compat_str as str,
	19	compat_Pattern,
	20	compat_Match,
	21	)
	22
	23
	24	class _MatchParser(object):
	25	"""
	26	An object that maintains the current parsing position and allows
	27	conveniently advancing it as syntax elements are successfully parsed.
	28	"""
	29
	30	def __init__(self, string):
	31	self._data = string
	32	self._pos = 0
	33
	34	def match(self, r):
	35	if isinstance(r, compat_Pattern):
	36	return r.match(self._data, self._pos)
	37	if isinstance(r, str):
	38	if self._data.startswith(r, self._pos):
	39	return len(r)
	40	return None
	41	raise ValueError(r)
	42
	43	def advance(self, by):
	44	if by is None:
	45	amt = 0
	46	elif isinstance(by, compat_Match):
	47	amt = len(by.group(0))
	48	elif isinstance(by, str):
	49	amt = len(by)
	50	elif isinstance(by, int):
	51	amt = by
	52	else:
	53	raise ValueError(by)
	54	self._pos += amt
	55	return by
	56
	57	def consume(self, r):
	58	return self.advance(self.match(r))
	59
	60	def child(self):
	61	return _MatchChildParser(self)
	62
	63
	64	class _MatchChildParser(_MatchParser):
65	"""
66	A child parser state, which advances through the same data as
67	its parent, but has an independent position. This is useful when
68	advancing through syntax elements we might later want to backtrack
69	from.
70	"""
71
72	def __init__(self, parent):
73	super(_MatchChildParser, self).__init__(parent._data)
74	self.__parent = parent
75	self._pos = parent._pos
76
77	def commit(self):
78	"""
79	Advance the parent state to the current position of this child state.
80	"""
81	self.__parent._pos = self._pos
82	return self.__parent
83
84
85	class ParseError(Exception):
86	def __init__(self, parser):
87	super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
88	parser._pos, parser._data[parser._pos:parser._pos + 20]
89	))
90
91
81a136b8	92	# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
	93	# prescribes that hours must be 2 or more digits, timestamps with a single
	94	# digit for the hour part has been seen in the wild.
	95	# See https://github.com/yt-dlp/yt-dlp/issues/921
4a2f19ab	96	_REGEX_TS = re.compile(r'''(?x)
81a136b8	97	(?:([0-9]{1,}):)?
4a2f19ab F	98	([0-9]{2}):
	99	([0-9]{2})\.
	100	([0-9]{3})?
	101	''')
	102	_REGEX_EOF = re.compile(r'\Z')
	103	_REGEX_NL = re.compile(r'(?:\r\n\|[\r\n])')
	104	_REGEX_BLANK = re.compile(r'(?:\r\n\|[\r\n])+')
	105
	106
	107	def _parse_ts(ts):
	108	"""
	109	Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
	110	into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
	111	"""
	112
	113	h, min, s, ms = ts.groups()
	114	return 90 * (
	115	int(h or 0) * 3600000 + # noqa: W504,E221,E222
	116	int(min) * 60000 + # noqa: W504,E221,E222
	117	int(s) * 1000 + # noqa: W504,E221,E222
	118	int(ms) # noqa: W504,E221,E222
	119	)
	120
	121
	122	def _format_ts(ts):
	123	"""
	124	Convert an MPEG PES timestamp into a WebVTT timestamp.
	125	This will lose sub-millisecond precision.
	126	"""
75722b03	127	msec = int((ts + 45) // 90)
	128	secs, msec = divmod(msec, 1000)
	129	mins, secs = divmod(secs, 60)
	130	hrs, mins = divmod(mins, 60)
	131	return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msec)
4a2f19ab F	132
	133
	134	class Block(object):
	135	"""
	136	An abstract WebVTT block.
	137	"""
	138
	139	def __init__(self, **kwargs):
	140	for key, val in kwargs.items():
	141	setattr(self, key, val)
	142
	143	@classmethod
	144	def parse(cls, parser):
	145	m = parser.match(cls._REGEX)
	146	if not m:
	147	return None
	148	parser.advance(m)
	149	return cls(raw=m.group(0))
	150
	151	def write_into(self, stream):
	152	stream.write(self.raw)
	153
	154
	155	class HeaderBlock(Block):
	156	"""
	157	A WebVTT block that may only appear in the header part of the file,
	158	i.e. before any cue blocks.
	159	"""
	160
	161	pass
	162
	163
	164	class Magic(HeaderBlock):
	165	_REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n\|[\r\n])')
	166
	167	# XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
	168	# <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
	169	# doesn’t specify the exact grammar nor where in the WebVTT
	170	# syntax it should be placed; the below has been devised based
	171	# on usage in the wild
	172	#
	173	# And strictly speaking, the presence of this extension violates
	174	# the W3C WebVTT spec. Oh well.
	175
	176	_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
	177	_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
	178	_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
81a136b8	179	_REGEX_TSMAP_SEP = re.compile(r'[ \t],[ \t]')
4a2f19ab F	180
	181	@classmethod
	182	def __parse_tsmap(cls, parser):
	183	parser = parser.child()
	184
	185	while True:
	186	m = parser.consume(cls._REGEX_TSMAP_LOCAL)
	187	if m:
	188	m = parser.consume(_REGEX_TS)
	189	if m is None:
	190	raise ParseError(parser)
	191	local = _parse_ts(m)
	192	if local is None:
	193	raise ParseError(parser)
	194	else:
	195	m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
	196	if m:
	197	mpegts = int_or_none(m.group(1))
	198	if mpegts is None:
	199	raise ParseError(parser)
	200	else:
	201	raise ParseError(parser)
81a136b8	202	if parser.consume(cls._REGEX_TSMAP_SEP):
4a2f19ab F	203	continue
	204	if parser.consume(_REGEX_NL):
	205	break
	206	raise ParseError(parser)
	207
	208	parser.commit()
	209	return local, mpegts
	210
	211	@classmethod
	212	def parse(cls, parser):
	213	parser = parser.child()
	214
	215	m = parser.consume(cls._REGEX)
	216	if not m:
	217	raise ParseError(parser)
	218
	219	extra = m.group(1)
	220	local, mpegts = None, None
	221	if parser.consume(cls._REGEX_TSMAP):
	222	local, mpegts = cls.__parse_tsmap(parser)
	223	if not parser.consume(_REGEX_NL):
	224	raise ParseError(parser)
	225	parser.commit()
	226	return cls(extra=extra, mpegts=mpegts, local=local)
	227
	228	def write_into(self, stream):
	229	stream.write('WEBVTT')
	230	if self.extra is not None:
	231	stream.write(self.extra)
	232	stream.write('\n')
	233	if self.local or self.mpegts:
	234	stream.write('X-TIMESTAMP-MAP=LOCAL:')
	235	stream.write(_format_ts(self.local if self.local is not None else 0))
	236	stream.write(',MPEGTS:')
	237	stream.write(str(self.mpegts if self.mpegts is not None else 0))
	238	stream.write('\n')
	239	stream.write('\n')
	240
	241
	242	class StyleBlock(HeaderBlock):
	243	_REGEX = re.compile(r'''(?x)
	244	STYLE[\ \t]*(?:\r\n\|[\r\n])
	245	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	246	(?:\r\n\|[\r\n])
	247	''')
	248
	249
	250	class RegionBlock(HeaderBlock):
	251	_REGEX = re.compile(r'''(?x)
	252	REGION[\ \t]*
	253	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	254	(?:\r\n\|[\r\n])
	255	''')
	256
	257
	258	class CommentBlock(Block):
	259	_REGEX = re.compile(r'''(?x)
	260	NOTE(?:\r\n\|[\ \t\r\n])
	261	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	262	(?:\r\n\|[\r\n])
	263	''')
	264
	265
	266	class CueBlock(Block):
267	"""
268	A cue block. The payload is not interpreted.
269	"""
270
271	_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n\|[\r\n])')
272	_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
273	_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
274	_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n\|[\r\n])?')
275
276	@classmethod
277	def parse(cls, parser):
278	parser = parser.child()
279
280	id = None
281	m = parser.consume(cls._REGEX_ID)
282	if m:
283	id = m.group(1)
284
285	m0 = parser.consume(_REGEX_TS)
286	if not m0:
287	return None
288	if not parser.consume(cls._REGEX_ARROW):
289	return None
290	m1 = parser.consume(_REGEX_TS)
291	if not m1:
292	return None
293	m2 = parser.consume(cls._REGEX_SETTINGS)
294	if not parser.consume(_REGEX_NL):
295	return None
296
297	start = _parse_ts(m0)
298	end = _parse_ts(m1)
299	settings = m2.group(1) if m2 is not None else None
300
301	text = io.StringIO()
302	while True:
303	m = parser.consume(cls._REGEX_PAYLOAD)
304	if not m:
305	break
306	text.write(m.group(0))
307
308	parser.commit()
309	return cls(
310	id=id,
311	start=start, end=end, settings=settings,
312	text=text.getvalue()
313	)
314
315	def write_into(self, stream):
316	if self.id is not None:
317	stream.write(self.id)
318	stream.write('\n')
319	stream.write(_format_ts(self.start))
320	stream.write(' --> ')
321	stream.write(_format_ts(self.end))
322	if self.settings is not None:
323	stream.write(' ')
324	stream.write(self.settings)
325	stream.write('\n')
326	stream.write(self.text)
327	stream.write('\n')
328
333217f4 F	329	@property
	330	def as_json(self):
	331	return {
	332	'id': self.id,
	333	'start': self.start,
	334	'end': self.end,
	335	'text': self.text,
	336	'settings': self.settings,
	337	}
	338
25a3f4f5 F	339	def __eq__(self, other):
	340	return self.as_json == other.as_json
	341
	342	@classmethod
	343	def from_json(cls, json):
	344	return cls(
	345	id=json['id'],
	346	start=json['start'],
	347	end=json['end'],
	348	text=json['text'],
	349	settings=json['settings']
	350	)
	351
	352	def hinges(self, other):
	353	if self.text != other.text:
	354	return False
	355	if self.settings != other.settings:
	356	return False
	357	return self.start <= self.end == other.start <= other.end
	358
4a2f19ab F	359
	360	def parse_fragment(frag_content):
	361	"""
	362	A generator that yields (partially) parsed WebVTT blocks when given
	363	a bytes object containing the raw contents of a WebVTT file.
	364	"""
	365
	366	parser = _MatchParser(frag_content.decode('utf-8'))
	367
	368	yield Magic.parse(parser)
	369
	370	while not parser.match(_REGEX_EOF):
	371	if parser.consume(_REGEX_BLANK):
	372	continue
	373
	374	block = RegionBlock.parse(parser)
	375	if block:
	376	yield block
	377	continue
	378	block = StyleBlock.parse(parser)
	379	if block:
	380	yield block
	381	continue
	382	block = CommentBlock.parse(parser)
	383	if block:
	384	yield block # XXX: or skip
	385	continue
	386
	387	break
	388
	389	while not parser.match(_REGEX_EOF):
	390	if parser.consume(_REGEX_BLANK):
	391	continue
	392
	393	block = CommentBlock.parse(parser)
	394	if block:
	395	yield block # XXX: or skip
	396	continue
	397	block = CueBlock.parse(parser)
	398	if block:
	399	yield block
	400	continue
	401
	402	raise ParseError(parser)