X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/81a136b80f3d29c73884bb116f869df44bfd6fa1..d4b52ce3fcb8d9578ed12365648eaba8718c603e:/yt_dlp/webvtt.py diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py index cd936e7e5..9f1a5086b 100644 --- a/yt_dlp/webvtt.py +++ b/yt_dlp/webvtt.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals, print_function, division - """ A partial parser for WebVTT segments. Interprets enough of the WebVTT stream to be able to assemble a single stand-alone subtitle file, suitably adjusting @@ -11,17 +8,13 @@ in RFC 8216 §3.5 . """ -import re import io -from .utils import int_or_none -from .compat import ( - compat_str as str, - compat_Pattern, - compat_Match, -) +import re +from .utils import int_or_none, timetuple_from_msec -class _MatchParser(object): + +class _MatchParser: """ An object that maintains the current parsing position and allows conveniently advancing it as syntax elements are successfully parsed. @@ -32,7 +25,7 @@ def __init__(self, string): self._pos = 0 def match(self, r): - if isinstance(r, compat_Pattern): + if isinstance(r, re.Pattern): return r.match(self._data, self._pos) if isinstance(r, str): if self._data.startswith(r, self._pos): @@ -43,7 +36,7 @@ def match(self, r): def advance(self, by): if by is None: amt = 0 - elif isinstance(by, compat_Match): + elif isinstance(by, re.Match): amt = len(by.group(0)) elif isinstance(by, str): amt = len(by) @@ -70,7 +63,7 @@ class _MatchChildParser(_MatchParser): """ def __init__(self, parent): - super(_MatchChildParser, self).__init__(parent._data) + super().__init__(parent._data) self.__parent = parent self._pos = parent._pos @@ -84,9 +77,8 @@ def commit(self): class ParseError(Exception): def __init__(self, parser): - super(ParseError, self).__init__("Parse error at position %u (near %r)" % ( - parser._pos, parser._data[parser._pos:parser._pos + 20] - )) + data = parser._data[parser._pos:parser._pos + 100] + super().__init__(f'Parse error at position {parser._pos} (near {data!r})') # While the specification @@ -100,8 +92,9 @@ def __init__(self, parser): ([0-9]{3})? ''') _REGEX_EOF = re.compile(r'\Z') -_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])') +_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)') _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+') +_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*') def _parse_ts(ts): @@ -109,14 +102,8 @@ def _parse_ts(ts): Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS) into an MPEG PES timestamp: a tick counter at 90 kHz resolution. """ - - h, min, s, ms = ts.groups() - return 90 * ( - int(h or 0) * 3600000 + # noqa: W504,E221,E222 - int(min) * 60000 + # noqa: W504,E221,E222 - int(s) * 1000 + # noqa: W504,E221,E222 - int(ms) # noqa: W504,E221,E222 - ) + return 90 * sum( + int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1))) def _format_ts(ts): @@ -124,14 +111,10 @@ def _format_ts(ts): Convert an MPEG PES timestamp into a WebVTT timestamp. This will lose sub-millisecond precision. """ - msec = int((ts + 45) // 90) - secs, msec = divmod(msec, 1000) - mins, secs = divmod(secs, 60) - hrs, mins = divmod(mins, 60) - return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msec) + return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90)) -class Block(object): +class Block: """ An abstract WebVTT block. """ @@ -157,7 +140,6 @@ class HeaderBlock(Block): A WebVTT block that may only appear in the header part of the file, i.e. before any cue blocks. """ - pass @@ -166,7 +148,7 @@ class Magic(HeaderBlock): # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5 # , but the RFC - # doesn’t specify the exact grammar nor where in the WebVTT + # doesn't specify the exact grammar nor where in the WebVTT # syntax it should be placed; the below has been devised based # on usage in the wild # @@ -178,6 +160,12 @@ class Magic(HeaderBlock): _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)') _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*') + # This was removed from the spec in the 2017 revision; + # the last spec draft to describe this syntax element is + # . + # Nevertheless, YouTube keeps serving those + _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])') + @classmethod def __parse_tsmap(cls, parser): parser = parser.child() @@ -217,13 +205,18 @@ def parse(cls, parser): raise ParseError(parser) extra = m.group(1) - local, mpegts = None, None - if parser.consume(cls._REGEX_TSMAP): - local, mpegts = cls.__parse_tsmap(parser) - if not parser.consume(_REGEX_NL): + local, mpegts, meta = None, None, '' + while not parser.consume(_REGEX_NL): + if parser.consume(cls._REGEX_TSMAP): + local, mpegts = cls.__parse_tsmap(parser) + continue + m = parser.consume(cls._REGEX_META) + if m: + meta += m.group(0) + continue raise ParseError(parser) parser.commit() - return cls(extra=extra, mpegts=mpegts, local=local) + return cls(extra=extra, mpegts=mpegts, local=local, meta=meta) def write_into(self, stream): stream.write('WEBVTT') @@ -236,6 +229,8 @@ def write_into(self, stream): stream.write(',MPEGTS:') stream.write(str(self.mpegts if self.mpegts is not None else 0)) stream.write('\n') + if self.meta: + stream.write(self.meta) stream.write('\n') @@ -277,10 +272,10 @@ class CueBlock(Block): def parse(cls, parser): parser = parser.child() - id = None + id_ = None m = parser.consume(cls._REGEX_ID) if m: - id = m.group(1) + id_ = m.group(1) m0 = parser.consume(_REGEX_TS) if not m0: @@ -291,6 +286,7 @@ def parse(cls, parser): if not m1: return None m2 = parser.consume(cls._REGEX_SETTINGS) + parser.consume(_REGEX_OPTIONAL_WHITESPACE) if not parser.consume(_REGEX_NL): return None @@ -307,9 +303,9 @@ def parse(cls, parser): parser.commit() return cls( - id=id, + id=id_, start=start, end=end, settings=settings, - text=text.getvalue() + text=text.getvalue(), ) def write_into(self, stream): @@ -346,7 +342,7 @@ def from_json(cls, json): start=json['start'], end=json['end'], text=json['text'], - settings=json['settings'] + settings=json['settings'], ) def hinges(self, other): @@ -363,7 +359,7 @@ def parse_fragment(frag_content): a bytes object containing the raw contents of a WebVTT file. """ - parser = _MatchParser(frag_content.decode('utf-8')) + parser = _MatchParser(frag_content.decode()) yield Magic.parse(parser)