[yt-dlp.git] / yt_dlp / webvtt.py

"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""

import io
import re

from .utils import int_or_none, timetuple_from_msec


class _MatchParser:
    """
    A cursor over a string: remembers how far parsing has progressed and
    offers helpers to test for, and step over, the next syntax element.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        """
        Test `r` against the data at the current position without moving.

        A compiled pattern yields the re.Match (or None); a plain string
        yields its length when the data continues with it, otherwise None.
        Any other type raises ValueError.
        """
        if isinstance(r, re.Pattern):
            return r.match(self._data, self._pos)
        if not isinstance(r, str):
            raise ValueError(r)
        return len(r) if self._data.startswith(r, self._pos) else None

    def advance(self, by):
        """
        Move the position forward and hand `by` back unchanged.

        `by` may be None (no movement), a re.Match (length of the whole
        match), a string (its length) or an int (that many characters).
        Any other type raises ValueError.
        """
        if isinstance(by, re.Match):
            step = by.end() - by.start()
        elif isinstance(by, str):
            step = len(by)
        elif isinstance(by, int):
            step = by
        elif by is None:
            step = 0
        else:
            raise ValueError(by)
        self._pos += step
        return by

    def consume(self, r):
        """Match `r` at the current position and step over it on success."""
        found = self.match(r)
        return self.advance(found)

    def child(self):
        """Create a child state starting at the current position."""
        return _MatchChildParser(self)


class _MatchChildParser(_MatchParser):
    """
    A parser state forked from a parent: it reads the same data but keeps
    its own position, so a speculative parse can simply be dropped, or
    made permanent with commit().
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Begin where the parent currently stands.
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Copy this child's position into the parent and return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent


class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the parser
    position and up to 100 characters of the data that follows it.
    """

    def __init__(self, parser):
        offset = parser._pos
        snippet = parser._data[offset:offset + 100]
        super().__init__("Parse error at position %u (near %r)" % (offset, snippet))


# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Capture groups, in order: hours (optional), minutes, seconds and
# milliseconds (optional); _parse_ts() relies on this order.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches (with an empty match) only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator (CRLF, CR or LF); failing that, an empty match
# at the `$` anchor.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# A possibly empty run of spaces and tabs.
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')


def _parse_ts(ts):
    """
    Turn a WebVTT timestamp match (a re.Match produced by _REGEX_TS) into
    an MPEG PES timestamp, i.e. a count of 90 kHz ticks.

    Groups that did not participate in the match count as zero.
    """
    # Milliseconds contributed by one unit of each capture group, in order.
    scales = (3600_000, 60_000, 1000, 1)
    millis = 0
    for field, scale in zip(ts.groups(), scales):
        millis += int(field or 0) * scale
    # 90 ticks per millisecond
    return 90 * millis


def _format_ts(ts):
    """
    Render an MPEG PES timestamp as a WebVTT timestamp string.
    Sub-millisecond precision is lost in the conversion.
    """
    # Adding half a millisecond (45 ticks) before the floor division
    # rounds to the nearest millisecond.
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)


class Block:
    """
    Base class for WebVTT blocks.

    Subclasses relying on the default parse() must provide a compiled
    pattern in the `_REGEX` class attribute.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the instance.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is advanced past the block and an instance
        holding the matched text in `raw` is returned; otherwise the
        parser is left untouched and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """Write the block's original text to `stream`."""
        stream.write(self.raw)


class HeaderBlock(Block):
    """
    Marker base class for WebVTT blocks that are only permitted in the
    header section of a file, ahead of the first cue block.
    """


class Magic(HeaderBlock):
    """
    The "WEBVTT" signature line together with the header lines that follow
    it, up to the line terminator (or end of input) that closes the header.

    Instances produced by parse() carry these attributes:
      extra  - text following "WEBVTT" on the signature line, or None
      local  - LOCAL value of X-TIMESTAMP-MAP in 90 kHz ticks, or None
      mpegts - MPEGTS value of X-TIMESTAMP-MAP as an int, or None
      meta   - concatenated raw metadata header lines ('' if there are none)
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn’t specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    # This was removed from the spec in the 2017 revision;
    # the last spec draft to describe this syntax element is
    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
    # Nevertheless, YouTube keeps serving those
    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the remainder of an X-TIMESTAMP-MAP line; the
        "X-TIMESTAMP-MAP=" prefix must already have been consumed.

        Returns a (local, mpegts) tuple. A component that does not appear
        on the line is returned as None. Raises ParseError on malformed
        input.
        """
        parser = parser.child()

        # Either component may be missing from the line. Start from None so
        # that such input yields a partial result instead of failing with
        # UnboundLocalError at the return statement.
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if not m:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            # A separator announces another component; a line terminator
            # (or end of input) ends the map; anything else is an error.
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the signature line and all header lines following it.

        Unlike Block.parse(), a failure to match raises ParseError rather
        than returning None. The given parser is only advanced on success.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts, meta = None, None, ''
        # Header lines continue until an empty line or the end of input.
        while not parser.consume(_REGEX_NL):
            if parser.consume(cls._REGEX_TSMAP):
                local, mpegts = cls.__parse_tsmap(parser)
                continue
            m = parser.consume(cls._REGEX_META)
            if m:
                meta += m.group(0)
                continue
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

    def write_into(self, stream):
        """
        Write the header to `stream`, using "\\n" as the line terminator.

        The X-TIMESTAMP-MAP line is emitted only when `local` or `mpegts`
        is truthy; a component that is None is written as zero.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        if self.meta:
            stream.write(self.meta)
        # Empty line terminating the header
        stream.write('\n')


class StyleBlock(HeaderBlock):
    """
    A STYLE block. It is matched as a whole by `_REGEX` and, through the
    inherited parse()/write_into(), passed through verbatim.

    The pattern accepts: the string "STYLE", optional spaces/tabs and a
    line terminator; then any number of non-empty lines that do not
    contain "-->"; then one more line terminator (the empty line that
    ends the block).
    """
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class RegionBlock(HeaderBlock):
    """
    A REGION block. It is matched as a whole by `_REGEX` and, through the
    inherited parse()/write_into(), passed through verbatim.

    The pattern accepts: the string "REGION" and optional spaces/tabs,
    optionally followed by a line terminator; then any number of non-empty
    lines that do not contain "-->"; then one more line terminator.
    """
    # The line terminator after "REGION" is what the W3C grammar prescribes
    # (and what StyleBlock expects after "STYLE"). Without it, the settings
    # lines of a conforming region block were left behind unparsed. It is
    # kept optional so that everything matched previously still matches.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CommentBlock(Block):
    """
    A NOTE (comment) block. It is matched as a whole by `_REGEX` and,
    through the inherited parse()/write_into(), passed through verbatim.

    The pattern accepts: the string "NOTE" followed by a line terminator,
    a space or a tab; then any number of non-empty lines that do not
    contain "-->"; then one more line terminator.
    """
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Instances carry these attributes:
      id       - the cue identifier line (without line terminator), or None
      start    - start time in 90 kHz ticks
      end      - end time in 90 kHz ticks
      settings - the raw cue settings text, or None
      text     - the raw payload, including the line terminators consumed
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read a cue block at the parser's position.

        Returns a CueBlock and advances the parser on success; returns
        None and leaves the parser untouched when the input at this
        position is not a cue block.
        """
        # Work on a child state so that a failed attempt consumes nothing.
        parser = parser.child()

        # Optional identifier line (a line not containing "-->")
        cue_id = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            cue_id = m.group(1)

        # Timing line: <start> --> <end> [settings]
        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        parser.consume(_REGEX_OPTIONAL_WHITESPACE)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload is every following non-empty line, kept verbatim.
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=cue_id,
            start=start, end=end, settings=settings,
            text=text.getvalue()
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`, followed by an empty line. Lines written
        by this method are terminated with "\\n"; the payload is emitted
        as stored.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        stream.write('\n')

    @property
    def as_json(self):
        """The cue as a plain dict; the inverse of from_json()."""
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        """Two cues are equal when all of their as_json fields are equal."""
        # Defer to Python's default handling for foreign types rather than
        # failing with AttributeError on the missing `as_json`.
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    # Defining __eq__ already makes instances unhashable; spell it out.
    __hash__ = None

    @classmethod
    def from_json(cls, json):
        """Build a cue from a dict as produced by as_json."""
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings']
        )

    def hinges(self, other):
        """
        Return True when `other` has the same text and settings as this
        cue and starts exactly where this cue ends, with neither cue
        ending before it starts.
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end


def parse_fragment(frag_content):
    """
    Generate the (partially) parsed WebVTT blocks contained in a bytes
    object holding the raw contents of a WebVTT file.

    The first block yielded is always the Magic header. ParseError is
    raised when the input cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode())

    yield Magic.parse(parser)

    # Block types tried, in this order, in each part of the file.
    header_types = (RegionBlock, StyleBlock, CommentBlock)
    body_types = (CommentBlock, CueBlock)

    # Header part: ends at the first thing that is not a header block.
    header_done = False
    while not header_done and not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        for block_type in header_types:
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: or skip (comments)
                break
        else:
            header_done = True

    # Cue part: everything up to the end of input must parse.
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue
        for block_type in body_types:
            block = block_type.parse(parser)
            if block:
                yield block  # XXX: or skip (comments)
                break
        else:
            raise ParseError(parser)
Commit	Line	Data
4a2f19ab F	1	"""
	2	A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
	3	to be able to assemble a single stand-alone subtitle file, suitably adjusting
	4	timestamps on the way, while everything else is passed through unmodified.
	5
	6	Regular expressions based on the W3C WebVTT specification
	7	<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
	8	in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
	9	"""
	10
4a2f19ab	11	import io
6929b41a	12	import re
f8271158	13
aa7785f8	14	from .utils import int_or_none, timetuple_from_msec
4a2f19ab F	15
4a2f19ab F	16
86e5f3ed	17	class _MatchParser:
4a2f19ab F	18	"""
	19	An object that maintains the current parsing position and allows
	20	conveniently advancing it as syntax elements are successfully parsed.
	21	"""
	22
	23	def __init__(self, string):
	24	self._data = string
	25	self._pos = 0
	26
	27	def match(self, r):
77f90330	28	if isinstance(r, re.Pattern):
4a2f19ab F	29	return r.match(self._data, self._pos)
	30	if isinstance(r, str):
	31	if self._data.startswith(r, self._pos):
	32	return len(r)
	33	return None
	34	raise ValueError(r)
	35
	36	def advance(self, by):
	37	if by is None:
	38	amt = 0
77f90330	39	elif isinstance(by, re.Match):
4a2f19ab F	40	amt = len(by.group(0))
	41	elif isinstance(by, str):
	42	amt = len(by)
	43	elif isinstance(by, int):
	44	amt = by
	45	else:
	46	raise ValueError(by)
	47	self._pos += amt
	48	return by
	49
	50	def consume(self, r):
	51	return self.advance(self.match(r))
	52
	53	def child(self):
	54	return _MatchChildParser(self)
	55
	56
	57	class _MatchChildParser(_MatchParser):
	58	"""
	59	A child parser state, which advances through the same data as
	60	its parent, but has an independent position. This is useful when
	61	advancing through syntax elements we might later want to backtrack
	62	from.
	63	"""
	64
	65	def __init__(self, parent):
86e5f3ed	66	super().__init__(parent._data)
4a2f19ab F	67	self.__parent = parent
	68	self._pos = parent._pos
	69
	70	def commit(self):
	71	"""
	72	Advance the parent state to the current position of this child state.
	73	"""
	74	self.__parent._pos = self._pos
	75	return self.__parent
	76
	77
	78	class ParseError(Exception):
	79	def __init__(self, parser):
86e5f3ed	80	super().__init__("Parse error at position %u (near %r)" % (
615a8444	81	parser._pos, parser._data[parser._pos:parser._pos + 100]
4a2f19ab F	82	))
	83
	84
81a136b8	85	# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
	86	# prescribes that hours must be 2 or more digits, timestamps with a single
	87	# digit for the hour part has been seen in the wild.
	88	# See https://github.com/yt-dlp/yt-dlp/issues/921
4a2f19ab	89	_REGEX_TS = re.compile(r'''(?x)
81a136b8	90	(?:([0-9]{1,}):)?
4a2f19ab F	91	([0-9]{2}):
	92	([0-9]{2})\.
	93	([0-9]{3})?
	94	''')
	95	_REGEX_EOF = re.compile(r'\Z')
f352a097	96	_REGEX_NL = re.compile(r'(?:\r\n\|[\r\n]\|$)')
4a2f19ab	97	_REGEX_BLANK = re.compile(r'(?:\r\n\|[\r\n])+')
15f22b48	98	_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
4a2f19ab F	99
	100
	101	def _parse_ts(ts):
	102	"""
	103	Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
	104	into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
	105	"""
19a03940	106	return 90 * sum(
19a03940	107	int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
4a2f19ab F	108
	109
	110	def _format_ts(ts):
	111	"""
	112	Convert an MPEG PES timestamp into a WebVTT timestamp.
	113	This will lose sub-millisecond precision.
	114	"""
aa7785f8	115	return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
4a2f19ab F	116
4a2f19ab F	117
86e5f3ed	118	class Block:
4a2f19ab F	119	"""
	120	An abstract WebVTT block.
	121	"""
	122
	123	def __init__(self, **kwargs):
	124	for key, val in kwargs.items():
	125	setattr(self, key, val)
	126
	127	@classmethod
	128	def parse(cls, parser):
	129	m = parser.match(cls._REGEX)
	130	if not m:
	131	return None
	132	parser.advance(m)
	133	return cls(raw=m.group(0))
	134
	135	def write_into(self, stream):
	136	stream.write(self.raw)
	137
	138
	139	class HeaderBlock(Block):
	140	"""
	141	A WebVTT block that may only appear in the header part of the file,
	142	i.e. before any cue blocks.
	143	"""
4a2f19ab F	144	pass
	145
	146
	147	class Magic(HeaderBlock):
	148	_REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n\|[\r\n])')
	149
	150	# XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
	151	# <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
	152	# doesn’t specify the exact grammar nor where in the WebVTT
	153	# syntax it should be placed; the below has been devised based
	154	# on usage in the wild
	155	#
	156	# And strictly speaking, the presence of this extension violates
	157	# the W3C WebVTT spec. Oh well.
	158
	159	_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
	160	_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
	161	_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
81a136b8	162	_REGEX_TSMAP_SEP = re.compile(r'[ \t],[ \t]')
4a2f19ab	163
c646d76f	164	# This was removed from the spec in the 2017 revision;
	165	# the last spec draft to describe this syntax element is
	166	# <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
	167	# Nevertheless, YouTube keeps serving those
	168	_REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n])')
	169
4a2f19ab F	170	@classmethod
	171	def __parse_tsmap(cls, parser):
	172	parser = parser.child()
	173
	174	while True:
	175	m = parser.consume(cls._REGEX_TSMAP_LOCAL)
	176	if m:
	177	m = parser.consume(_REGEX_TS)
	178	if m is None:
	179	raise ParseError(parser)
	180	local = _parse_ts(m)
	181	if local is None:
	182	raise ParseError(parser)
	183	else:
	184	m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
	185	if m:
	186	mpegts = int_or_none(m.group(1))
	187	if mpegts is None:
	188	raise ParseError(parser)
	189	else:
	190	raise ParseError(parser)
81a136b8	191	if parser.consume(cls._REGEX_TSMAP_SEP):
4a2f19ab F	192	continue
	193	if parser.consume(_REGEX_NL):
	194	break
	195	raise ParseError(parser)
	196
	197	parser.commit()
	198	return local, mpegts
	199
	200	@classmethod
	201	def parse(cls, parser):
	202	parser = parser.child()
	203
	204	m = parser.consume(cls._REGEX)
	205	if not m:
	206	raise ParseError(parser)
	207
	208	extra = m.group(1)
c646d76f	209	local, mpegts, meta = None, None, ''
	210	while not parser.consume(_REGEX_NL):
	211	if parser.consume(cls._REGEX_TSMAP):
	212	local, mpegts = cls.__parse_tsmap(parser)
	213	continue
	214	m = parser.consume(cls._REGEX_META)
	215	if m:
	216	meta += m.group(0)
	217	continue
4a2f19ab F	218	raise ParseError(parser)
4a2f19ab F	219	parser.commit()
c646d76f	220	return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
4a2f19ab F	221
	222	def write_into(self, stream):
	223	stream.write('WEBVTT')
	224	if self.extra is not None:
	225	stream.write(self.extra)
	226	stream.write('\n')
	227	if self.local or self.mpegts:
	228	stream.write('X-TIMESTAMP-MAP=LOCAL:')
	229	stream.write(_format_ts(self.local if self.local is not None else 0))
	230	stream.write(',MPEGTS:')
	231	stream.write(str(self.mpegts if self.mpegts is not None else 0))
	232	stream.write('\n')
c646d76f	233	if self.meta:
c646d76f	234	stream.write(self.meta)
4a2f19ab F	235	stream.write('\n')
	236
	237
	238	class StyleBlock(HeaderBlock):
	239	_REGEX = re.compile(r'''(?x)
	240	STYLE[\ \t]*(?:\r\n\|[\r\n])
	241	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	242	(?:\r\n\|[\r\n])
	243	''')
	244
	245
	246	class RegionBlock(HeaderBlock):
	247	_REGEX = re.compile(r'''(?x)
	248	REGION[\ \t]*
	249	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	250	(?:\r\n\|[\r\n])
	251	''')
	252
	253
	254	class CommentBlock(Block):
	255	_REGEX = re.compile(r'''(?x)
	256	NOTE(?:\r\n\|[\ \t\r\n])
	257	((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*
	258	(?:\r\n\|[\r\n])
	259	''')
	260
	261
	262	class CueBlock(Block):
	263	"""
	264	A cue block. The payload is not interpreted.
	265	"""
	266
	267	_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n\|[\r\n])')
	268	_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
	269	_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
	270	_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n\|[\r\n])?')
	271
	272	@classmethod
	273	def parse(cls, parser):
	274	parser = parser.child()
	275
	276	id = None
	277	m = parser.consume(cls._REGEX_ID)
	278	if m:
	279	id = m.group(1)
	280
	281	m0 = parser.consume(_REGEX_TS)
	282	if not m0:
	283	return None
	284	if not parser.consume(cls._REGEX_ARROW):
	285	return None
	286	m1 = parser.consume(_REGEX_TS)
	287	if not m1:
	288	return None
	289	m2 = parser.consume(cls._REGEX_SETTINGS)
298230e5	290	parser.consume(_REGEX_OPTIONAL_WHITESPACE)
4a2f19ab F	291	if not parser.consume(_REGEX_NL):
	292	return None
	293
	294	start = _parse_ts(m0)
	295	end = _parse_ts(m1)
	296	settings = m2.group(1) if m2 is not None else None
	297
	298	text = io.StringIO()
	299	while True:
	300	m = parser.consume(cls._REGEX_PAYLOAD)
	301	if not m:
	302	break
	303	text.write(m.group(0))
	304
	305	parser.commit()
	306	return cls(
	307	id=id,
	308	start=start, end=end, settings=settings,
	309	text=text.getvalue()
	310	)
	311
	312	def write_into(self, stream):
	313	if self.id is not None:
	314	stream.write(self.id)
	315	stream.write('\n')
	316	stream.write(_format_ts(self.start))
	317	stream.write(' --> ')
	318	stream.write(_format_ts(self.end))
	319	if self.settings is not None:
	320	stream.write(' ')
	321	stream.write(self.settings)
	322	stream.write('\n')
	323	stream.write(self.text)
	324	stream.write('\n')
	325
333217f4 F	326	@property
	327	def as_json(self):
	328	return {
	329	'id': self.id,
	330	'start': self.start,
	331	'end': self.end,
	332	'text': self.text,
	333	'settings': self.settings,
	334	}
	335
25a3f4f5 F	336	def __eq__(self, other):
	337	return self.as_json == other.as_json
	338
	339	@classmethod
	340	def from_json(cls, json):
	341	return cls(
	342	id=json['id'],
	343	start=json['start'],
	344	end=json['end'],
	345	text=json['text'],
	346	settings=json['settings']
	347	)
	348
	349	def hinges(self, other):
	350	if self.text != other.text:
	351	return False
	352	if self.settings != other.settings:
	353	return False
	354	return self.start <= self.end == other.start <= other.end
	355
4a2f19ab F	356
	357	def parse_fragment(frag_content):
	358	"""
	359	A generator that yields (partially) parsed WebVTT blocks when given
	360	a bytes object containing the raw contents of a WebVTT file.
	361	"""
	362
0f06bcd7	363	parser = _MatchParser(frag_content.decode())
4a2f19ab F	364
	365	yield Magic.parse(parser)
	366
	367	while not parser.match(_REGEX_EOF):
	368	if parser.consume(_REGEX_BLANK):
	369	continue
	370
	371	block = RegionBlock.parse(parser)
	372	if block:
	373	yield block
	374	continue
	375	block = StyleBlock.parse(parser)
	376	if block:
	377	yield block
	378	continue
	379	block = CommentBlock.parse(parser)
	380	if block:
	381	yield block # XXX: or skip
	382	continue
	383
	384	break
	385
	386	while not parser.match(_REGEX_EOF):
	387	if parser.consume(_REGEX_BLANK):
	388	continue
	389
	390	block = CommentBlock.parse(parser)
	391	if block:
	392	yield block # XXX: or skip
	393	continue
	394	block = CueBlock.parse(parser)
	395	if block:
	396	yield block
	397	continue
	398
	399	raise ParseError(parser)