X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/0f06bcd7591332937fdec497d6cbb4914358bc79..7b5674949fd03a33b47b67b31d56a5adf1c48c91:/yt_dlp/webvtt.py diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py index b8974f883..7683bfb0f 100644 --- a/yt_dlp/webvtt.py +++ b/yt_dlp/webvtt.py @@ -9,8 +9,8 @@ """ import io +import re -from .compat import re from .utils import int_or_none, timetuple_from_msec @@ -78,7 +78,7 @@ def commit(self): class ParseError(Exception): def __init__(self, parser): super().__init__("Parse error at position %u (near %r)" % ( - parser._pos, parser._data[parser._pos:parser._pos + 20] + parser._pos, parser._data[parser._pos:parser._pos + 100] )) @@ -93,8 +93,9 @@ def __init__(self, parser): ([0-9]{3})? ''') _REGEX_EOF = re.compile(r'\Z') -_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])') +_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)') _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+') +_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*') def _parse_ts(ts): @@ -140,7 +141,6 @@ class HeaderBlock(Block): A WebVTT block that may only appear in the header part of the file, i.e. before any cue blocks. """ - pass @@ -161,6 +161,12 @@ class Magic(HeaderBlock): _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)') _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*') + # This was removed from the spec in the 2017 revision; + # the last spec draft to describe this syntax element is + # . + # Nevertheless, YouTube keeps serving those + _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])') + @classmethod def __parse_tsmap(cls, parser): parser = parser.child() @@ -200,13 +206,18 @@ def parse(cls, parser): raise ParseError(parser) extra = m.group(1) - local, mpegts = None, None - if parser.consume(cls._REGEX_TSMAP): - local, mpegts = cls.__parse_tsmap(parser) - if not parser.consume(_REGEX_NL): + local, mpegts, meta = None, None, '' + while not parser.consume(_REGEX_NL): + if parser.consume(cls._REGEX_TSMAP): + local, mpegts = cls.__parse_tsmap(parser) + continue + m = parser.consume(cls._REGEX_META) + if m: + meta += m.group(0) + continue raise ParseError(parser) parser.commit() - return cls(extra=extra, mpegts=mpegts, local=local) + return cls(extra=extra, mpegts=mpegts, local=local, meta=meta) def write_into(self, stream): stream.write('WEBVTT') @@ -219,6 +230,8 @@ def write_into(self, stream): stream.write(',MPEGTS:') stream.write(str(self.mpegts if self.mpegts is not None else 0)) stream.write('\n') + if self.meta: + stream.write(self.meta) stream.write('\n') @@ -274,6 +287,7 @@ def parse(cls, parser): if not m1: return None m2 = parser.consume(cls._REGEX_SETTINGS) + parser.consume(_REGEX_OPTIONAL_WHITESPACE) if not parser.consume(_REGEX_NL): return None