[ie/bbc] Fix and extend extraction (#9705)

[yt-dlp.git] / yt_dlp / webvtt.py
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py

index 741622b25b777ee75c5eb29da5059a88327ec610..7683bfb0f3a0768e44b76c70ab034cd2da5a5d82 100644 (file)
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -11,7 +11,6 @@
  import io
  import re
  
-from .compat import compat_Match, compat_Pattern
  from .utils import int_or_none, timetuple_from_msec
  
  
@@ -26,7 +25,7 @@ def __init__(self, string):
          self._pos = 0
  
      def match(self, r):
-        if isinstance(r, compat_Pattern):
+        if isinstance(r, re.Pattern):
              return r.match(self._data, self._pos)
          if isinstance(r, str):
              if self._data.startswith(r, self._pos):
@@ -37,7 +36,7 @@ def match(self, r):
      def advance(self, by):
          if by is None:
              amt = 0
-        elif isinstance(by, compat_Match):
+        elif isinstance(by, re.Match):
              amt = len(by.group(0))
          elif isinstance(by, str):
              amt = len(by)
@@ -79,7 +78,7 @@ def commit(self):
  class ParseError(Exception):
      def __init__(self, parser):
          super().__init__("Parse error at position %u (near %r)" % (
-            parser._pos, parser._data[parser._pos:parser._pos + 20]
+            parser._pos, parser._data[parser._pos:parser._pos + 100]
          ))
  
  
@@ -94,8 +93,9 @@ def __init__(self, parser):
      ([0-9]{3})?
  ''')
  _REGEX_EOF = re.compile(r'\Z')
-_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
+_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
  _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
+_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
  
  
  def _parse_ts(ts):
@@ -141,7 +141,6 @@ class HeaderBlock(Block):
      A WebVTT block that may only appear in the header part of the file,
      i.e. before any cue blocks.
      """
-
      pass
  
  
@@ -162,6 +161,12 @@ class Magic(HeaderBlock):
      _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
      _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
  
+    # Metadata headers were removed from the spec in the 2017 revision;
+    # the last spec draft to describe this syntax element is
+    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
+    # Nevertheless, YouTube keeps serving files that contain them.
+    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
+
      @classmethod
      def __parse_tsmap(cls, parser):
          parser = parser.child()
@@ -201,13 +206,18 @@ def parse(cls, parser):
              raise ParseError(parser)
  
          extra = m.group(1)
-        local, mpegts = None, None
-        if parser.consume(cls._REGEX_TSMAP):
-            local, mpegts = cls.__parse_tsmap(parser)
-        if not parser.consume(_REGEX_NL):
+        local, mpegts, meta = None, None, ''
+        while not parser.consume(_REGEX_NL):
+            if parser.consume(cls._REGEX_TSMAP):
+                local, mpegts = cls.__parse_tsmap(parser)
+                continue
+            m = parser.consume(cls._REGEX_META)
+            if m:
+                meta += m.group(0)
+                continue
              raise ParseError(parser)
          parser.commit()
-        return cls(extra=extra, mpegts=mpegts, local=local)
+        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
  
      def write_into(self, stream):
          stream.write('WEBVTT')
@@ -220,6 +230,8 @@ def write_into(self, stream):
              stream.write(',MPEGTS:')
              stream.write(str(self.mpegts if self.mpegts is not None else 0))
              stream.write('\n')
+        if self.meta:
+            stream.write(self.meta)
          stream.write('\n')
  
  
@@ -275,6 +287,7 @@ def parse(cls, parser):
          if not m1:
              return None
          m2 = parser.consume(cls._REGEX_SETTINGS)
+        parser.consume(_REGEX_OPTIONAL_WHITESPACE)
          if not parser.consume(_REGEX_NL):
              return None
  
@@ -347,7 +360,7 @@ def parse_fragment(frag_content):
      a bytes object containing the raw contents of a WebVTT file.
      """
  
-    parser = _MatchParser(frag_content.decode('utf-8'))
+    parser = _MatchParser(frag_content.decode())
  
      yield Magic.parse(parser)