[ie/ArteTV] Label forced subtitles (#9945)

[yt-dlp.git] / yt_dlp / webvtt.py
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py

index 962aa57ad61e7db511a2253140b5860aac21e693..7683bfb0f3a0768e44b76c70ab034cd2da5a5d82 100644 (file)
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals, print_function, division
-
  """
  A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
  to be able to assemble a single stand-alone subtitle file, suitably adjusting
  """
  A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
  to be able to assemble a single stand-alone subtitle file, suitably adjusting
@@ -11,17 +8,13 @@
  in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
  """
  
  in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
  """
  
-import re
  import io
  import io
+import re
+
  from .utils import int_or_none, timetuple_from_msec
  from .utils import int_or_none, timetuple_from_msec
-from .compat import (
-    compat_str as str,
-    compat_Pattern,
-    compat_Match,
-)
  
  
  
  
-class _MatchParser(object):
+class _MatchParser:
      """
      An object that maintains the current parsing position and allows
      conveniently advancing it as syntax elements are successfully parsed.
      """
      An object that maintains the current parsing position and allows
      conveniently advancing it as syntax elements are successfully parsed.
@@ -32,7 +25,7 @@ def __init__(self, string):
          self._pos = 0
  
      def match(self, r):
          self._pos = 0
  
      def match(self, r):
-        if isinstance(r, compat_Pattern):
+        if isinstance(r, re.Pattern):
              return r.match(self._data, self._pos)
          if isinstance(r, str):
              if self._data.startswith(r, self._pos):
              return r.match(self._data, self._pos)
          if isinstance(r, str):
              if self._data.startswith(r, self._pos):
@@ -43,7 +36,7 @@ def match(self, r):
      def advance(self, by):
          if by is None:
              amt = 0
      def advance(self, by):
          if by is None:
              amt = 0
-        elif isinstance(by, compat_Match):
+        elif isinstance(by, re.Match):
              amt = len(by.group(0))
          elif isinstance(by, str):
              amt = len(by)
              amt = len(by.group(0))
          elif isinstance(by, str):
              amt = len(by)
@@ -70,7 +63,7 @@ class _MatchChildParser(_MatchParser):
      """
  
      def __init__(self, parent):
      """
  
      def __init__(self, parent):
-        super(_MatchChildParser, self).__init__(parent._data)
+        super().__init__(parent._data)
          self.__parent = parent
          self._pos = parent._pos
  
          self.__parent = parent
          self._pos = parent._pos
  
@@ -84,8 +77,8 @@ def commit(self):
  
  class ParseError(Exception):
      def __init__(self, parser):
  
  class ParseError(Exception):
      def __init__(self, parser):
-        super(ParseError, self).__init__("Parse error at position %u (near %r)" % (
-            parser._pos, parser._data[parser._pos:parser._pos + 20]
+        super().__init__("Parse error at position %u (near %r)" % (
+            parser._pos, parser._data[parser._pos:parser._pos + 100]
          ))
  
  
          ))
  
  
@@ -100,8 +93,9 @@ def __init__(self, parser):
      ([0-9]{3})?
  ''')
  _REGEX_EOF = re.compile(r'\Z')
      ([0-9]{3})?
  ''')
  _REGEX_EOF = re.compile(r'\Z')
-_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])')
+_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
  _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
  _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
+_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
  
  
  def _parse_ts(ts):
  
  
  def _parse_ts(ts):
@@ -109,14 +103,8 @@ def _parse_ts(ts):
      Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
      into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
      """
      Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
      into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
      """
-
-    h, min, s, ms = ts.groups()
-    return 90 * (
-        int(h or 0) * 3600000 +  # noqa: W504,E221,E222
-        int(min)    *   60000 +  # noqa: W504,E221,E222
-        int(s)      *    1000 +  # noqa: W504,E221,E222
-        int(ms)                  # noqa: W504,E221,E222
-    )
+    return 90 * sum(
+        int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
  
  
  def _format_ts(ts):
  
  
  def _format_ts(ts):
@@ -127,7 +115,7 @@ def _format_ts(ts):
      return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
  
  
      return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
  
  
-class Block(object):
+class Block:
      """
      An abstract WebVTT block.
      """
      """
      An abstract WebVTT block.
      """
@@ -153,7 +141,6 @@ class HeaderBlock(Block):
      A WebVTT block that may only appear in the header part of the file,
      i.e. before any cue blocks.
      """
      A WebVTT block that may only appear in the header part of the file,
      i.e. before any cue blocks.
      """
-
      pass
  
  
      pass
  
  
@@ -174,6 +161,12 @@ class Magic(HeaderBlock):
      _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
      _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
  
      _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
      _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
  
+    # The metadata header syntax was removed from the WebVTT spec in the 2017
+    # revision; the last spec draft to describe this syntax element is
+    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
+    # Nevertheless, YouTube keeps serving such metadata headers.
+    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
+
      @classmethod
      def __parse_tsmap(cls, parser):
          parser = parser.child()
      @classmethod
      def __parse_tsmap(cls, parser):
          parser = parser.child()
@@ -213,13 +206,18 @@ def parse(cls, parser):
              raise ParseError(parser)
  
          extra = m.group(1)
              raise ParseError(parser)
  
          extra = m.group(1)
-        local, mpegts = None, None
-        if parser.consume(cls._REGEX_TSMAP):
-            local, mpegts = cls.__parse_tsmap(parser)
-        if not parser.consume(_REGEX_NL):
+        local, mpegts, meta = None, None, ''
+        while not parser.consume(_REGEX_NL):
+            if parser.consume(cls._REGEX_TSMAP):
+                local, mpegts = cls.__parse_tsmap(parser)
+                continue
+            m = parser.consume(cls._REGEX_META)
+            if m:
+                meta += m.group(0)
+                continue
              raise ParseError(parser)
          parser.commit()
              raise ParseError(parser)
          parser.commit()
-        return cls(extra=extra, mpegts=mpegts, local=local)
+        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
  
      def write_into(self, stream):
          stream.write('WEBVTT')
  
      def write_into(self, stream):
          stream.write('WEBVTT')
@@ -232,6 +230,8 @@ def write_into(self, stream):
              stream.write(',MPEGTS:')
              stream.write(str(self.mpegts if self.mpegts is not None else 0))
              stream.write('\n')
              stream.write(',MPEGTS:')
              stream.write(str(self.mpegts if self.mpegts is not None else 0))
              stream.write('\n')
+        if self.meta:
+            stream.write(self.meta)
          stream.write('\n')
  
  
          stream.write('\n')
  
  
@@ -287,6 +287,7 @@ def parse(cls, parser):
          if not m1:
              return None
          m2 = parser.consume(cls._REGEX_SETTINGS)
          if not m1:
              return None
          m2 = parser.consume(cls._REGEX_SETTINGS)
+        parser.consume(_REGEX_OPTIONAL_WHITESPACE)
          if not parser.consume(_REGEX_NL):
              return None
  
          if not parser.consume(_REGEX_NL):
              return None
  
@@ -359,7 +360,7 @@ def parse_fragment(frag_content):
      a bytes object containing the raw contents of a WebVTT file.
      """
  
      a bytes object containing the raw contents of a WebVTT file.
      """
  
-    parser = _MatchParser(frag_content.decode('utf-8'))
+    parser = _MatchParser(frag_content.decode())
  
      yield Magic.parse(parser)
  
  
      yield Magic.parse(parser)