yt_dlp/webvtt.py

   1 """
   2 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
   3 to be able to assemble a single stand-alone subtitle file, suitably adjusting
   4 timestamps on the way, while everything else is passed through unmodified.
   5
   6 Regular expressions based on the W3C WebVTT specification
   7 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
   8 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
   9 """
  10
  11 import io
  12 import re
  13
  14 from .utils import int_or_none, timetuple_from_msec
  15
  16
  17 class _MatchParser:
  18     """
  19     An object that maintains the current parsing position and allows
  20     conveniently advancing it as syntax elements are successfully parsed.
  21     """
  22
  23     def __init__(self, string):
  24         self._data = string
  25         self._pos = 0
  26
  27     def match(self, r):
  28         if isinstance(r, re.Pattern):
  29             return r.match(self._data, self._pos)
  30         if isinstance(r, str):
  31             if self._data.startswith(r, self._pos):
  32                 return len(r)
  33             return None
  34         raise ValueError(r)
  35
  36     def advance(self, by):
  37         if by is None:
  38             amt = 0
  39         elif isinstance(by, re.Match):
  40             amt = len(by.group(0))
  41         elif isinstance(by, str):
  42             amt = len(by)
  43         elif isinstance(by, int):
  44             amt = by
  45         else:
  46             raise ValueError(by)
  47         self._pos += amt
  48         return by
  49
  50     def consume(self, r):
  51         return self.advance(self.match(r))
  52
  53     def child(self):
  54         return _MatchChildParser(self)
  55
  56
  57 class _MatchChildParser(_MatchParser):
  58     """
  59     A child parser state, which advances through the same data as
  60     its parent, but has an independent position. This is useful when
  61     advancing through syntax elements we might later want to backtrack
  62     from.
  63     """
  64
  65     def __init__(self, parent):
  66         super().__init__(parent._data)
  67         self.__parent = parent
  68         self._pos = parent._pos
  69
  70     def commit(self):
  71         """
  72         Advance the parent state to the current position of this child state.
  73         """
  74         self.__parent._pos = self._pos
  75         return self.__parent
  76
  77
  78 class ParseError(Exception):
  79     def __init__(self, parser):
  80         super().__init__("Parse error at position %u (near %r)" % (
  81             parser._pos, parser._data[parser._pos:parser._pos + 100]
  82         ))
  83
  84
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Capture groups: (1) optional hours, (2) two-digit minutes, (3) two-digit
# seconds, (4) optional three-digit milliseconds. The '.' after the seconds
# is always required, even when the milliseconds are missing.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Empty match that succeeds only at the very end of the input.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator (CRLF, CR or LF), or an empty match at end of input.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# A run of one or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# Zero or more spaces/tabs; always matches, possibly with an empty match.
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
  99
 100
 101 def _parse_ts(ts):
 102     """
 103     Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
 104     into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
 105     """
 106     return 90 * sum(
 107         int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
 108
 109
 110 def _format_ts(ts):
 111     """
 112     Convert an MPEG PES timestamp into a WebVTT timestamp.
 113     This will lose sub-millisecond precision.
 114     """
 115     return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
 116
 117
 118 class Block:
 119     """
 120     An abstract WebVTT block.
 121     """
 122
 123     def __init__(self, **kwargs):
 124         for key, val in kwargs.items():
 125             setattr(self, key, val)
 126
 127     @classmethod
 128     def parse(cls, parser):
 129         m = parser.match(cls._REGEX)
 130         if not m:
 131             return None
 132         parser.advance(m)
 133         return cls(raw=m.group(0))
 134
 135     def write_into(self, stream):
 136         stream.write(self.raw)
 137
 138
 139 class HeaderBlock(Block):
 140     """
 141     A WebVTT block that may only appear in the header part of the file,
 142     i.e. before any cue blocks.
 143     """
 144     pass
 145
 146
 147 class Magic(HeaderBlock):
 148     _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
 149
 150     # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
 151     # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
 152     # doesn’t specify the exact grammar nor where in the WebVTT
 153     # syntax it should be placed; the below has been devised based
 154     # on usage in the wild
 155     #
 156     # And strictly speaking, the presence of this extension violates
 157     # the W3C WebVTT spec. Oh well.
 158
 159     _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
 160     _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
 161     _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
 162     _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
 163
 164     # This was removed from the spec in the 2017 revision;
 165     # the last spec draft to describe this syntax element is
 166     # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
 167     # Nevertheless, YouTube keeps serving those
 168     _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
 169
 170     @classmethod
 171     def __parse_tsmap(cls, parser):
 172         parser = parser.child()
 173
 174         while True:
 175             m = parser.consume(cls._REGEX_TSMAP_LOCAL)
 176             if m:
 177                 m = parser.consume(_REGEX_TS)
 178                 if m is None:
 179                     raise ParseError(parser)
 180                 local = _parse_ts(m)
 181                 if local is None:
 182                     raise ParseError(parser)
 183             else:
 184                 m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
 185                 if m:
 186                     mpegts = int_or_none(m.group(1))
 187                     if mpegts is None:
 188                         raise ParseError(parser)
 189                 else:
 190                     raise ParseError(parser)
 191             if parser.consume(cls._REGEX_TSMAP_SEP):
 192                 continue
 193             if parser.consume(_REGEX_NL):
 194                 break
 195             raise ParseError(parser)
 196
 197         parser.commit()
 198         return local, mpegts
 199
 200     @classmethod
 201     def parse(cls, parser):
 202         parser = parser.child()
 203
 204         m = parser.consume(cls._REGEX)
 205         if not m:
 206             raise ParseError(parser)
 207
 208         extra = m.group(1)
 209         local, mpegts, meta = None, None, ''
 210         while not parser.consume(_REGEX_NL):
 211             if parser.consume(cls._REGEX_TSMAP):
 212                 local, mpegts = cls.__parse_tsmap(parser)
 213                 continue
 214             m = parser.consume(cls._REGEX_META)
 215             if m:
 216                 meta += m.group(0)
 217                 continue
 218             raise ParseError(parser)
 219         parser.commit()
 220         return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
 221
 222     def write_into(self, stream):
 223         stream.write('WEBVTT')
 224         if self.extra is not None:
 225             stream.write(self.extra)
 226         stream.write('\n')
 227         if self.local or self.mpegts:
 228             stream.write('X-TIMESTAMP-MAP=LOCAL:')
 229             stream.write(_format_ts(self.local if self.local is not None else 0))
 230             stream.write(',MPEGTS:')
 231             stream.write(str(self.mpegts if self.mpegts is not None else 0))
 232             stream.write('\n')
 233         if self.meta:
 234             stream.write(self.meta)
 235         stream.write('\n')
 236
 237
class StyleBlock(HeaderBlock):
    """
    A STYLE block, passed through verbatim.

    Matches a ``STYLE`` line (optionally followed by spaces or tabs), then
    zero or more non-empty lines that do not contain ``-->``, then the
    line terminator of the blank line that ends the block.
    """
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
 244
 245
 246 class RegionBlock(HeaderBlock):
 247     _REGEX = re.compile(r'''(?x)
 248         REGION[\ \t]*
 249         ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
 250         (?:\r\n|[\r\n])
 251     ''')
 252
 253
class CommentBlock(Block):
    """
    A NOTE (comment) block, passed through verbatim.

    Matches ``NOTE`` followed by a space, a tab or a line terminator, then
    zero or more non-empty lines that do not contain ``-->``, then the
    line terminator of the blank line that ends the block.
    """
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
 260
 261
 262 class CueBlock(Block):
 263     """
 264     A cue block. The payload is not interpreted.
 265     """
 266
 267     _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
 268     _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
 269     _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
 270     _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')
 271
 272     @classmethod
 273     def parse(cls, parser):
 274         parser = parser.child()
 275
 276         id = None
 277         m = parser.consume(cls._REGEX_ID)
 278         if m:
 279             id = m.group(1)
 280
 281         m0 = parser.consume(_REGEX_TS)
 282         if not m0:
 283             return None
 284         if not parser.consume(cls._REGEX_ARROW):
 285             return None
 286         m1 = parser.consume(_REGEX_TS)
 287         if not m1:
 288             return None
 289         m2 = parser.consume(cls._REGEX_SETTINGS)
 290         parser.consume(_REGEX_OPTIONAL_WHITESPACE)
 291         if not parser.consume(_REGEX_NL):
 292             return None
 293
 294         start = _parse_ts(m0)
 295         end = _parse_ts(m1)
 296         settings = m2.group(1) if m2 is not None else None
 297
 298         text = io.StringIO()
 299         while True:
 300             m = parser.consume(cls._REGEX_PAYLOAD)
 301             if not m:
 302                 break
 303             text.write(m.group(0))
 304
 305         parser.commit()
 306         return cls(
 307             id=id,
 308             start=start, end=end, settings=settings,
 309             text=text.getvalue()
 310         )
 311
 312     def write_into(self, stream):
 313         if self.id is not None:
 314             stream.write(self.id)
 315             stream.write('\n')
 316         stream.write(_format_ts(self.start))
 317         stream.write(' --> ')
 318         stream.write(_format_ts(self.end))
 319         if self.settings is not None:
 320             stream.write(' ')
 321             stream.write(self.settings)
 322         stream.write('\n')
 323         stream.write(self.text)
 324         stream.write('\n')
 325
 326     @property
 327     def as_json(self):
 328         return {
 329             'id': self.id,
 330             'start': self.start,
 331             'end': self.end,
 332             'text': self.text,
 333             'settings': self.settings,
 334         }
 335
 336     def __eq__(self, other):
 337         return self.as_json == other.as_json
 338
 339     @classmethod
 340     def from_json(cls, json):
 341         return cls(
 342             id=json['id'],
 343             start=json['start'],
 344             end=json['end'],
 345             text=json['text'],
 346             settings=json['settings']
 347         )
 348
 349     def hinges(self, other):
 350         if self.text != other.text:
 351             return False
 352         if self.settings != other.settings:
 353             return False
 354         return self.start <= self.end == other.start <= other.end
 355
 356
 357 def parse_fragment(frag_content):
 358     """
 359     A generator that yields (partially) parsed WebVTT blocks when given
 360     a bytes object containing the raw contents of a WebVTT file.
 361     """
 362
 363     parser = _MatchParser(frag_content.decode())
 364
 365     yield Magic.parse(parser)
 366
 367     while not parser.match(_REGEX_EOF):
 368         if parser.consume(_REGEX_BLANK):
 369             continue
 370
 371         block = RegionBlock.parse(parser)
 372         if block:
 373             yield block
 374             continue
 375         block = StyleBlock.parse(parser)
 376         if block:
 377             yield block
 378             continue
 379         block = CommentBlock.parse(parser)
 380         if block:
 381             yield block  # XXX: or skip
 382             continue
 383
 384         break
 385
 386     while not parser.match(_REGEX_EOF):
 387         if parser.consume(_REGEX_BLANK):
 388             continue
 389
 390         block = CommentBlock.parse(parser)
 391         if block:
 392             yield block  # XXX: or skip
 393             continue
 394         block = CueBlock.parse(parser)
 395         if block:
 396             yield block
 397             continue
 398
 399         raise ParseError(parser)