yt_dlp/webvtt.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals, print_function, division
   3
   4 """
   5 A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
   6 to be able to assemble a single stand-alone subtitle file, suitably adjusting
   7 timestamps on the way, while everything else is passed through unmodified.
   8
   9 Regular expressions based on the W3C WebVTT specification
  10 <https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
  11 in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
  12 """
  13
  14 import re
  15 import io
  16 from .utils import int_or_none, timetuple_from_msec
  17 from .compat import (
  18     compat_Pattern,
  19     compat_Match,
  20 )
  21
  22
  23 class _MatchParser(object):
  24     """
  25     An object that maintains the current parsing position and allows
  26     conveniently advancing it as syntax elements are successfully parsed.
  27     """
  28
  29     def __init__(self, string):
  30         self._data = string
  31         self._pos = 0
  32
  33     def match(self, r):
  34         if isinstance(r, compat_Pattern):
  35             return r.match(self._data, self._pos)
  36         if isinstance(r, str):
  37             if self._data.startswith(r, self._pos):
  38                 return len(r)
  39             return None
  40         raise ValueError(r)
  41
  42     def advance(self, by):
  43         if by is None:
  44             amt = 0
  45         elif isinstance(by, compat_Match):
  46             amt = len(by.group(0))
  47         elif isinstance(by, str):
  48             amt = len(by)
  49         elif isinstance(by, int):
  50             amt = by
  51         else:
  52             raise ValueError(by)
  53         self._pos += amt
  54         return by
  55
  56     def consume(self, r):
  57         return self.advance(self.match(r))
  58
  59     def child(self):
  60         return _MatchChildParser(self)
  61
  62
  63 class _MatchChildParser(_MatchParser):
  64     """
  65     A child parser state, which advances through the same data as
  66     its parent, but has an independent position. This is useful when
  67     advancing through syntax elements we might later want to backtrack
  68     from.
  69     """
  70
  71     def __init__(self, parent):
  72         super(_MatchChildParser, self).__init__(parent._data)
  73         self.__parent = parent
  74         self._pos = parent._pos
  75
  76     def commit(self):
  77         """
  78         Advance the parent state to the current position of this child state.
  79         """
  80         self.__parent._pos = self._pos
  81         return self.__parent
  82
  83
  84 class ParseError(Exception):
  85     def __init__(self, parser):
  86         super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
  87             parser._pos, parser._data[parser._pos:parser._pos + 20]
  88         ))
  89
  90
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) optional hours, (2) minutes, (3) seconds, (4) optional
# three-digit milliseconds. The '.' after the seconds is mandatory.
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Matches only at the very end of the data.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator: CRLF, CR or LF.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
# A run of one or more line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
 104
 105
 106 def _parse_ts(ts):
 107     """
 108     Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
 109     into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
 110     """
 111
 112     h, min, s, ms = ts.groups()
 113     return 90 * (
 114         int(h or 0) * 3600000 +  # noqa: W504,E221,E222
 115         int(min)    *   60000 +  # noqa: W504,E221,E222
 116         int(s)      *    1000 +  # noqa: W504,E221,E222
 117         int(ms)                  # noqa: W504,E221,E222
 118     )
 119
 120
 121 def _format_ts(ts):
 122     """
 123     Convert an MPEG PES timestamp into a WebVTT timestamp.
 124     This will lose sub-millisecond precision.
 125     """
 126     return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
 127
 128
 129 class Block(object):
 130     """
 131     An abstract WebVTT block.
 132     """
 133
 134     def __init__(self, **kwargs):
 135         for key, val in kwargs.items():
 136             setattr(self, key, val)
 137
 138     @classmethod
 139     def parse(cls, parser):
 140         m = parser.match(cls._REGEX)
 141         if not m:
 142             return None
 143         parser.advance(m)
 144         return cls(raw=m.group(0))
 145
 146     def write_into(self, stream):
 147         stream.write(self.raw)
 148
 149
 150 class HeaderBlock(Block):
 151     """
 152     A WebVTT block that may only appear in the header part of the file,
 153     i.e. before any cue blocks.
 154     """
 155
 156     pass
 157
 158
 159 class Magic(HeaderBlock):
 160     _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
 161
 162     # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
 163     # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
 164     # doesn’t specify the exact grammar nor where in the WebVTT
 165     # syntax it should be placed; the below has been devised based
 166     # on usage in the wild
 167     #
 168     # And strictly speaking, the presence of this extension violates
 169     # the W3C WebVTT spec. Oh well.
 170
 171     _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
 172     _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
 173     _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
 174     _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
 175
 176     @classmethod
 177     def __parse_tsmap(cls, parser):
 178         parser = parser.child()
 179
 180         while True:
 181             m = parser.consume(cls._REGEX_TSMAP_LOCAL)
 182             if m:
 183                 m = parser.consume(_REGEX_TS)
 184                 if m is None:
 185                     raise ParseError(parser)
 186                 local = _parse_ts(m)
 187                 if local is None:
 188                     raise ParseError(parser)
 189             else:
 190                 m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
 191                 if m:
 192                     mpegts = int_or_none(m.group(1))
 193                     if mpegts is None:
 194                         raise ParseError(parser)
 195                 else:
 196                     raise ParseError(parser)
 197             if parser.consume(cls._REGEX_TSMAP_SEP):
 198                 continue
 199             if parser.consume(_REGEX_NL):
 200                 break
 201             raise ParseError(parser)
 202
 203         parser.commit()
 204         return local, mpegts
 205
 206     @classmethod
 207     def parse(cls, parser):
 208         parser = parser.child()
 209
 210         m = parser.consume(cls._REGEX)
 211         if not m:
 212             raise ParseError(parser)
 213
 214         extra = m.group(1)
 215         local, mpegts = None, None
 216         if parser.consume(cls._REGEX_TSMAP):
 217             local, mpegts = cls.__parse_tsmap(parser)
 218         if not parser.consume(_REGEX_NL):
 219             raise ParseError(parser)
 220         parser.commit()
 221         return cls(extra=extra, mpegts=mpegts, local=local)
 222
 223     def write_into(self, stream):
 224         stream.write('WEBVTT')
 225         if self.extra is not None:
 226             stream.write(self.extra)
 227         stream.write('\n')
 228         if self.local or self.mpegts:
 229             stream.write('X-TIMESTAMP-MAP=LOCAL:')
 230             stream.write(_format_ts(self.local if self.local is not None else 0))
 231             stream.write(',MPEGTS:')
 232             stream.write(str(self.mpegts if self.mpegts is not None else 0))
 233             stream.write('\n')
 234         stream.write('\n')
 235
 236
class StyleBlock(HeaderBlock):
    """
    A STYLE block. It is matched as a whole by `_REGEX`; the contents are
    not interpreted.
    """

    # A "STYLE" line (trailing spaces/tabs allowed), then any number of
    # non-empty lines that do not contain "-->", then an empty line.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
 243
 244
 245 class RegionBlock(HeaderBlock):
 246     _REGEX = re.compile(r'''(?x)
 247         REGION[\ \t]*
 248         ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
 249         (?:\r\n|[\r\n])
 250     ''')
 251
 252
class CommentBlock(Block):
    """
    A NOTE (comment) block. It is matched as a whole by `_REGEX`; the
    contents are not interpreted.
    """

    # "NOTE" followed by a space, tab or line terminator, then any number
    # of non-empty lines that do not contain "-->", then a line terminator.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
 259
 260
 261 class CueBlock(Block):
 262     """
 263     A cue block. The payload is not interpreted.
 264     """
 265
 266     _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
 267     _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
 268     _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
 269     _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')
 270
 271     @classmethod
 272     def parse(cls, parser):
 273         parser = parser.child()
 274
 275         id = None
 276         m = parser.consume(cls._REGEX_ID)
 277         if m:
 278             id = m.group(1)
 279
 280         m0 = parser.consume(_REGEX_TS)
 281         if not m0:
 282             return None
 283         if not parser.consume(cls._REGEX_ARROW):
 284             return None
 285         m1 = parser.consume(_REGEX_TS)
 286         if not m1:
 287             return None
 288         m2 = parser.consume(cls._REGEX_SETTINGS)
 289         if not parser.consume(_REGEX_NL):
 290             return None
 291
 292         start = _parse_ts(m0)
 293         end = _parse_ts(m1)
 294         settings = m2.group(1) if m2 is not None else None
 295
 296         text = io.StringIO()
 297         while True:
 298             m = parser.consume(cls._REGEX_PAYLOAD)
 299             if not m:
 300                 break
 301             text.write(m.group(0))
 302
 303         parser.commit()
 304         return cls(
 305             id=id,
 306             start=start, end=end, settings=settings,
 307             text=text.getvalue()
 308         )
 309
 310     def write_into(self, stream):
 311         if self.id is not None:
 312             stream.write(self.id)
 313             stream.write('\n')
 314         stream.write(_format_ts(self.start))
 315         stream.write(' --> ')
 316         stream.write(_format_ts(self.end))
 317         if self.settings is not None:
 318             stream.write(' ')
 319             stream.write(self.settings)
 320         stream.write('\n')
 321         stream.write(self.text)
 322         stream.write('\n')
 323
 324     @property
 325     def as_json(self):
 326         return {
 327             'id': self.id,
 328             'start': self.start,
 329             'end': self.end,
 330             'text': self.text,
 331             'settings': self.settings,
 332         }
 333
 334     def __eq__(self, other):
 335         return self.as_json == other.as_json
 336
 337     @classmethod
 338     def from_json(cls, json):
 339         return cls(
 340             id=json['id'],
 341             start=json['start'],
 342             end=json['end'],
 343             text=json['text'],
 344             settings=json['settings']
 345         )
 346
 347     def hinges(self, other):
 348         if self.text != other.text:
 349             return False
 350         if self.settings != other.settings:
 351             return False
 352         return self.start <= self.end == other.start <= other.end
 353
 354
 355 def parse_fragment(frag_content):
 356     """
 357     A generator that yields (partially) parsed WebVTT blocks when given
 358     a bytes object containing the raw contents of a WebVTT file.
 359     """
 360
 361     parser = _MatchParser(frag_content.decode('utf-8'))
 362
 363     yield Magic.parse(parser)
 364
 365     while not parser.match(_REGEX_EOF):
 366         if parser.consume(_REGEX_BLANK):
 367             continue
 368
 369         block = RegionBlock.parse(parser)
 370         if block:
 371             yield block
 372             continue
 373         block = StyleBlock.parse(parser)
 374         if block:
 375             yield block
 376             continue
 377         block = CommentBlock.parse(parser)
 378         if block:
 379             yield block  # XXX: or skip
 380             continue
 381
 382         break
 383
 384     while not parser.match(_REGEX_EOF):
 385         if parser.consume(_REGEX_BLANK):
 386             continue
 387
 388         block = CommentBlock.parse(parser)
 389         if block:
 390             yield block  # XXX: or skip
 391             continue
 392         block = CueBlock.parse(parser)
 393         if block:
 394             yield block
 395             continue
 396
 397         raise ParseError(parser)