[yt-dlp.git] / yt_dlp / webvtt.py

# coding: utf-8
from __future__ import unicode_literals, print_function, division

"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""

import re
import io
from .utils import int_or_none
from .compat import (
    compat_str as str,
    compat_Pattern,
    compat_Match,
)


class _MatchParser(object):
    """
    An object that maintains the current parsing position and allows
    conveniently advancing it as syntax elements are successfully parsed.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        if isinstance(r, compat_Pattern):
            return r.match(self._data, self._pos)
        if isinstance(r, str):
            if self._data.startswith(r, self._pos):
                return len(r)
            return None
        raise ValueError(r)

    def advance(self, by):
        if by is None:
            amt = 0
        elif isinstance(by, compat_Match):
            amt = len(by.group(0))
        elif isinstance(by, str):
            amt = len(by)
        elif isinstance(by, int):
            amt = by
        else:
            raise ValueError(by)
        self._pos += amt
        return by

    def consume(self, r):
        return self.advance(self.match(r))

    def child(self):
        return _MatchChildParser(self)


class _MatchChildParser(_MatchParser):
    """
    A tentative parser state. It reads the same data as the parser it was
    created from and starts at that parser's position, but moves on its own.
    Nothing done here affects the parent until commit() is called, so simply
    dropping the child amounts to backtracking.
    """

    def __init__(self, parent):
        _MatchParser.__init__(self, parent._data)
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Copy this child's position to the parent and return the parent.
        """
        origin = self.__parent
        origin._pos = self._pos
        return origin


class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the
    parser's current position and up to 20 characters of the data found
    there.
    """

    def __init__(self, parser):
        offset = parser._pos
        excerpt = parser._data[offset:offset + 20]
        message = "Parse error at position %u (near %r)" % (offset, excerpt)
        super(ParseError, self).__init__(message)


# A WebVTT timestamp: optional hours (two or more digits) followed by ":",
# two-digit minutes, ":", two-digit seconds, "." and an optional three-digit
# fraction. Groups are (hours, minutes, seconds, milliseconds); the hours
# and milliseconds groups are None when that part is absent.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{2,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with an empty match) only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# Exactly one line terminator: CRLF, a lone CR or a lone LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')


def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
    """

    h, min, s, ms = ts.groups()
    return 90 * (
        int(h or 0) * 3600000 +  # noqa: W504,E221,E222
        int(min)    *   60000 +  # noqa: W504,E221,E222
        int(s)      *    1000 +  # noqa: W504,E221,E222
        int(ms)                  # noqa: W504,E221,E222
    )


def _format_ts(ts):
    """
    Convert an MPEG PES timestamp into a WebVTT timestamp.
    This will lose sub-millisecond precision.
    """
    msec = int((ts + 45) // 90)
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msec)


class Block(object):
    """
    Common base of all WebVTT blocks.

    The default behaviour is pass-through: parse() recognises the block with
    the subclass's `_REGEX` pattern and keeps the matched text verbatim in
    `raw`, and write_into() emits that text unchanged.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.
        Returns the new block and advances the parser past it, or returns
        None and leaves the parser where it was.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's text to `stream`.
        """
        stream.write(self.raw)


class HeaderBlock(Block):
    """
    Marker base class for blocks that belong to the header part of a WebVTT
    file, i.e. those that may only occur before the first cue block.
    """


class Magic(HeaderBlock):
    """
    The file signature line ("WEBVTT" plus optional trailing text), together
    with the optional X-TIMESTAMP-MAP header line that may follow it.

    Parsed instances carry three attributes:
      extra  -- the text following "WEBVTT" on the signature line, including
                its leading space or tab, or None if there was none
      local  -- the LOCAL timestamp in 90 kHz ticks, or None if not given
      mpegts -- the MPEGTS value as an integer, or None if not given
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the attribute list that follows "X-TIMESTAMP-MAP=".

        The list consists of "LOCAL:<timestamp>" and "MPEGTS:<integer>"
        items in any order, separated by commas and ended by a line
        terminator, which is consumed as well.

        Returns a (local, mpegts) tuple; an item that does not occur in the
        list is reported as None. Raises ParseError on any other input.
        """
        parser = parser.child()

        # Start from None so that a header carrying only one of the two
        # attributes yields None for the other one, rather than failing
        # with UnboundLocalError at the return statement below.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            if parser.consume(','):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line, the optional X-TIMESTAMP-MAP line and the
        line terminator that must follow them.

        Returns a Magic instance and advances `parser` past the header.
        Raises ParseError, leaving `parser` where it was, if the signature
        is missing or the header is not followed by a line terminator.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts = None, None
        if parser.consume(cls._REGEX_TSMAP):
            local, mpegts = cls.__parse_tsmap(parser)
        if not parser.consume(_REGEX_NL):
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local)

    def write_into(self, stream):
        """
        Write the header to `stream` using "\\n" line endings.

        The X-TIMESTAMP-MAP line is emitted only when `local` or `mpegts`
        is non-zero; whichever of the two is None is written as 0.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        stream.write('\n')


class StyleBlock(HeaderBlock):
    """
    A STYLE block, kept verbatim.

    The pattern accepts the keyword "STYLE", optional spaces or tabs and a
    line terminator, then any number of non-empty lines that do not contain
    "-->", and finally the line terminator that ends the block.
    """

    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class RegionBlock(HeaderBlock):
    """
    A REGION block, kept verbatim.

    The pattern accepts the keyword "REGION", optional spaces or tabs and
    an optional line terminator, then any number of non-empty lines that do
    not contain "-->", and finally the line terminator that ends the block.

    The W3C syntax puts a line terminator between the "REGION" line and the
    settings lines. It is optional here so that blocks with settings on the
    "REGION" line itself are still accepted.
    """

    # Without the optional terminator after "REGION", a conforming block
    # ("REGION", newline, settings lines, blank line) only matched as far
    # as the first newline, and the settings lines that followed could not
    # be parsed as anything.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CommentBlock(Block):
    """
    A NOTE (comment) block, kept verbatim.

    The pattern accepts the keyword "NOTE" followed by a space, a tab or a
    line terminator, then any number of non-empty lines that do not contain
    "-->" (the first of which may be the rest of the "NOTE" line), and
    finally the line terminator that ends the block.
    """

    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry the following attributes:
      id       -- the cue identifier line (without its terminator), or None
      start    -- start time in 90 kHz ticks
      end      -- end time in 90 kHz ticks
      settings -- the raw cue settings text from the timing line, or None
      text     -- the raw payload, with line terminators as found
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read one cue at the parser's position.

        Returns the parsed cue and advances `parser` past it, or returns
        None and leaves `parser` where it was if no well-formed timing line
        is found.
        """
        parser = parser.child()

        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line, up to the first
        # blank line or the end of the data.
        payload = []
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            payload.append(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=''.join(payload)
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`, followed by a blank separator line.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        # The last payload line has no terminator when the cue was parsed
        # at the very end of the data. Supply one, so that the newline
        # written below really produces the blank line that separates this
        # cue from whatever is written next.
        if self.text and not self.text.endswith(('\r', '\n')):
            stream.write('\n')
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue's attributes as a plain dict.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Decline to compare with anything that is not a cue, so that e.g.
        # `cue == None` evaluates to False instead of raising AttributeError.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    def __ne__(self, other):
        # Python 2 does not derive != from ==.
        outcome = self.__eq__(other)
        return outcome if outcome is NotImplemented else not outcome

    # Cues are mutable and compare by value, hence unhashable. Python 3
    # implies this as soon as __eq__ is defined; stated here explicitly.
    __hash__ = None

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict with the keys produced by `as_json`.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Tell whether `other` has the same text and settings as this cue and
        starts at exactly the time this cue ends (neither cue having its end
        before its start).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end


def parse_fragment(frag_content):
    """
    Decode a WebVTT fragment, given as a bytes object holding UTF-8 text,
    and lazily yield the (partially) parsed blocks it contains, starting
    with the Magic header block. ParseError is raised when something that
    is neither a comment nor a cue turns up after the header section.
    """

    cursor = _MatchParser(frag_content.decode('utf-8'))

    def first_of(block_types):
        # Try the given block types in order at the current position and
        # return the first block that parses, or None if none of them does.
        for block_type in block_types:
            parsed = block_type.parse(cursor)
            if parsed:
                return parsed
        return None

    yield Magic.parse(cursor)

    # Header section: regions, styles and comments. It ends quietly at the
    # first thing that is none of these.
    while not cursor.match(_REGEX_EOF):
        if cursor.consume(_REGEX_BLANK):
            continue
        header = first_of((RegionBlock, StyleBlock, CommentBlock))
        if header is None:
            break
        yield header  # XXX: comments could be skipped instead

    # Cue section: comments and cues only; anything else is an error.
    while not cursor.match(_REGEX_EOF):
        if cursor.consume(_REGEX_BLANK):
            continue
        entry = first_of((CommentBlock, CueBlock))
        if entry is None:
            raise ParseError(cursor)
        yield entry  # XXX: comments could be skipped instead
Commit	Line	Data
4a2f19ab F	1	# coding: utf-8
	2	from __future__ import unicode_literals, print_function, division
	3
	4	"""
	5	A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
	6	to be able to assemble a single stand-alone subtitle file, suitably adjusting
	7	timestamps on the way, while everything else is passed through unmodified.
	8
	9	Regular expressions based on the W3C WebVTT specification
	10	<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
	11	in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
	12	"""
	13
	14	import re
	15	import io
	16	from .utils import int_or_none
	17	from .compat import (
	18	compat_str as str,
	19	compat_Pattern,
	20	compat_Match,
	21	)
	22
	23
	24	class _MatchParser(object):
	25	"""
	26	An object that maintains the current parsing position and allows
	27	conveniently advancing it as syntax elements are successfully parsed.
	28	"""
	29
	30	def __init__(self, string):
	31	self._data = string
	32	self._pos = 0
	33
	34	def match(self, r):
	35	if isinstance(r, compat_Pattern):
	36	return r.match(self._data, self._pos)
	37	if isinstance(r, str):
	38	if self._data.startswith(r, self._pos):
	39	return len(r)
	40	return None
	41	raise ValueError(r)
	42
	43	def advance(self, by):
	44	if by is None:
	45	amt = 0
	46	elif isinstance(by, compat_Match):
	47	amt = len(by.group(0))
	48	elif isinstance(by, str):
	49	amt = len(by)
	50	elif isinstance(by, int):
	51	amt = by
	52	else:
	53	raise ValueError(by)
	54	self._pos += amt
	55	return by
	56
	57	def consume(self, r):
	58	return self.advance(self.match(r))
	59
	60	def child(self):
	61	return _MatchChildParser(self)
	62
	63
	64	class _MatchChildParser(_MatchParser):
65	"""
66	A child parser state, which advances through the same data as
67	its parent, but has an independent position. This is useful when
68	advancing through syntax elements we might later want to backtrack
69	from.
70	"""
71
72	def __init__(self, parent):
73	super(_MatchChildParser, self).__init__(parent._data)
74	self.__parent = parent
75	self._pos = parent._pos
76
77	def commit(self):
78	"""
79	Advance the parent state to the current position of this child state.
80	"""
81	self.__parent._pos = self._pos
82	return self.__parent
83
84
85	class ParseError(Exception):
86	def __init__(self, parser):
87	super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
88	parser._pos, parser._data[parser._pos:parser._pos + 20]
89	))
90
91
92	_REGEX_TS = re.compile(r'''(?x)
93	(?:([0-9]{2,}):)?
94	([0-9]{2}):
95	([0-9]{2})\.
96	([0-9]{3})?
97	''')
98	_REGEX_EOF = re.compile(r'\Z')
99	_REGEX_NL = re.compile(r'(?:\r\n\|[\r\n])')
100	_REGEX_BLANK = re.compile(r'(?:\r\n\|[\r\n])+')
101
102
103	def _parse_ts(ts):
104	"""
105	Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
106	into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
107	"""
108
109	h, min, s, ms = ts.groups()
110	return 90 * (
111	int(h or 0) * 3600000 + # noqa: W504,E221,E222
112	int(min) * 60000 + # noqa: W504,E221,E222
113	int(s) * 1000 + # noqa: W504,E221,E222
114	int(ms) # noqa: W504,E221,E222
115	)
116
117
118	def _format_ts(ts):
119	"""
120	Convert an MPEG PES timestamp into a WebVTT timestamp.
121	This will lose sub-millisecond precision.
122	"""
75722b03	123	msec = int((ts + 45) // 90)
	124	secs, msec = divmod(msec, 1000)
	125	mins, secs = divmod(secs, 60)
	126	hrs, mins = divmod(mins, 60)
	127	return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msec)
4a2f19ab F	128
	129
	130	class Block(object):
	131	"""
	132	An abstract WebVTT block.
	133	"""
	134
	135	def __init__(self, **kwargs):
	136	for key, val in kwargs.items():
	137	setattr(self, key, val)
	138
	139	@classmethod
	140	def parse(cls, parser):
	141	m = parser.match(cls._REGEX)
	142	if not m:
	143	return None
	144	parser.advance(m)
	145	return cls(raw=m.group(0))
	146
	147	def write_into(self, stream):
	148	stream.write(self.raw)
	149
	150
	151	class HeaderBlock(Block):
	152	"""
	153	A WebVTT block that may only appear in the header part of the file,
	154	i.e. before any cue blocks.
	155	"""
	156
	157	pass
	158
	159
	160	class Magic(HeaderBlock):
	161	_REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n\|[\r\n])')
	162
	163	# XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
	164	# <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
	165	# doesn’t specify the exact grammar nor where in the WebVTT
	166	# syntax it should be placed; the below has been devised based
	167	# on usage in the wild
	168	#
	169	# And strictly speaking, the presence of this extension violates
	170	# the W3C WebVTT spec. Oh well.
	171
	172	_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
	173	_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
	174	_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
	175
	176	@classmethod
	177	def __parse_tsmap(cls, parser):
	178	parser = parser.child()
	179
	180	while True:
	181	m = parser.consume(cls._REGEX_TSMAP_LOCAL)
	182	if m:
	183	m = parser.consume(_REGEX_TS)
	184	if m is None:
	185	raise ParseError(parser)
	186	local = _parse_ts(m)
	187	if local is None:
	188	raise ParseError(parser)
	189	else:
	190	m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
	191	if m:
192	mpegts = int_or_none(m.group(1))
193	if mpegts is None:
194	raise ParseError(parser)
195	else:
196	raise ParseError(parser)
197	if parser.consume(','):
198	continue
199	if parser.consume(_REGEX_NL):
200	break
201	raise ParseError(parser)
202
203	parser.commit()
204	return local, mpegts
205
206	@classmethod
207	def parse(cls, parser):
208	parser = parser.child()
209
210	m = parser.consume(cls._REGEX)
211	if not m:
212	raise ParseError(parser)
213
214	extra = m.group(1)
215	local, mpegts = None, None
216	if parser.consume(cls._REGEX_TSMAP):
217	local, mpegts = cls.__parse_tsmap(parser)
218	if not parser.consume(_REGEX_NL):
219	raise ParseError(parser)
220	parser.commit()
221	return cls(extra=extra, mpegts=mpegts, local=local)
222
223	def write_into(self, stream):
224	stream.write('WEBVTT')
225	if self.extra is not None:
226	stream.write(self.extra)
227	stream.write('\n')
228	if self.local or self.mpegts:
229	stream.write('X-TIMESTAMP-MAP=LOCAL:')
230	stream.write(_format_ts(self.local if self.local is not None else 0))
231	stream.write(',MPEGTS:')
232	stream.write(str(self.mpegts if self.mpegts is not None else 0))
233	stream.write('\n')
234	stream.write('\n')
235
236
237	class StyleBlock(HeaderBlock):
238	_REGEX = re.compile(r'''(?x)
239	STYLE[\ \t]*(?:\r\n\|[\r\n])
240	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
241	(?:\r\n\|[\r\n])
242	''')
243
244
245	class RegionBlock(HeaderBlock):
246	_REGEX = re.compile(r'''(?x)
247	REGION[\ \t]*
248	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
249	(?:\r\n\|[\r\n])
250	''')
251
252
253	class CommentBlock(Block):
254	_REGEX = re.compile(r'''(?x)
255	NOTE(?:\r\n\|[\ \t\r\n])
256	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
257	(?:\r\n\|[\r\n])
258	''')
259
260
261	class CueBlock(Block):
262	"""
263	A cue block. The payload is not interpreted.
264	"""
265
266	_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n\|[\r\n])')
267	_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
268	_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
269	_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n\|[\r\n])?')
270
271	@classmethod
272	def parse(cls, parser):
273	parser = parser.child()
274
275	id = None
276	m = parser.consume(cls._REGEX_ID)
277	if m:
278	id = m.group(1)
279
280	m0 = parser.consume(_REGEX_TS)
281	if not m0:
282	return None
283	if not parser.consume(cls._REGEX_ARROW):
284	return None
285	m1 = parser.consume(_REGEX_TS)
286	if not m1:
287	return None
288	m2 = parser.consume(cls._REGEX_SETTINGS)
289	if not parser.consume(_REGEX_NL):
290	return None
291
292	start = _parse_ts(m0)
293	end = _parse_ts(m1)
294	settings = m2.group(1) if m2 is not None else None
295
296	text = io.StringIO()
297	while True:
298	m = parser.consume(cls._REGEX_PAYLOAD)
299	if not m:
300	break
301	text.write(m.group(0))
302
303	parser.commit()
304	return cls(
305	id=id,
306	start=start, end=end, settings=settings,
307	text=text.getvalue()
308	)
309
310	def write_into(self, stream):
311	if self.id is not None:
312	stream.write(self.id)
313	stream.write('\n')
314	stream.write(_format_ts(self.start))
315	stream.write(' --> ')
316	stream.write(_format_ts(self.end))
317	if self.settings is not None:
318	stream.write(' ')
319	stream.write(self.settings)
320	stream.write('\n')
321	stream.write(self.text)
322	stream.write('\n')
323
333217f4 F	324	@property
	325	def as_json(self):
	326	return {
	327	'id': self.id,
	328	'start': self.start,
	329	'end': self.end,
	330	'text': self.text,
	331	'settings': self.settings,
	332	}
	333
25a3f4f5 F	334	def __eq__(self, other):
	335	return self.as_json == other.as_json
	336
	337	@classmethod
	338	def from_json(cls, json):
	339	return cls(
	340	id=json['id'],
	341	start=json['start'],
	342	end=json['end'],
	343	text=json['text'],
	344	settings=json['settings']
	345	)
	346
	347	def hinges(self, other):
	348	if self.text != other.text:
	349	return False
	350	if self.settings != other.settings:
	351	return False
	352	return self.start <= self.end == other.start <= other.end
	353
4a2f19ab F	354
	355	def parse_fragment(frag_content):
	356	"""
	357	A generator that yields (partially) parsed WebVTT blocks when given
	358	a bytes object containing the raw contents of a WebVTT file.
	359	"""
	360
	361	parser = _MatchParser(frag_content.decode('utf-8'))
	362
	363	yield Magic.parse(parser)
	364
	365	while not parser.match(_REGEX_EOF):
	366	if parser.consume(_REGEX_BLANK):
	367	continue
	368
	369	block = RegionBlock.parse(parser)
	370	if block:
	371	yield block
	372	continue
	373	block = StyleBlock.parse(parser)
	374	if block:
	375	yield block
	376	continue
	377	block = CommentBlock.parse(parser)
	378	if block:
	379	yield block # XXX: or skip
	380	continue
	381
	382	break
	383
	384	while not parser.match(_REGEX_EOF):
	385	if parser.consume(_REGEX_BLANK):
	386	continue
	387
	388	block = CommentBlock.parse(parser)
	389	if block:
	390	yield block # XXX: or skip
	391	continue
	392	block = CueBlock.parse(parser)
	393	if block:
	394	yield block
	395	continue
	396
	397	raise ParseError(parser)