[webvtt] Merge daisy-chained duplicate cues (#638)

author Felix S <redacted>

Mon, 9 Aug 2021 20:22:30 +0000 (20:22 +0000)

committer GitHub <redacted>

Mon, 9 Aug 2021 20:22:30 +0000 (01:52 +0530)
author Felix S <redacted>
Mon, 9 Aug 2021 20:22:30 +0000 (20:22 +0000)
committer GitHub <redacted>
Mon, 9 Aug 2021 20:22:30 +0000 (01:52 +0530)
diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py

index 1cc99a4e92a8bb3d7426b7cd0755174f514102a2..e3af140fde3a85ffc1c461d60b852afe45f5a7fc 100644 (file)
--- a/yt_dlp/downloader/fragment.py
+++ b/yt_dlp/downloader/fragment.py
@@ -329,7 +329,7 @@ def _prepare_external_frag_download(self, ctx):
              'fragment_index': 0,
          })
  
-    def download_and_append_fragments(self, ctx, fragments, info_dict, pack_func=None):
+    def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None):
          fragment_retries = self.params.get('fragment_retries', 0)
          is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)
          if not pack_func:
@@ -424,5 +424,8 @@ def _download_fragment(fragment):
                  if not result:
                      return False
  
+        if finish_func is not None:
+            ctx['dest_stream'].write(finish_func())
+            ctx['dest_stream'].flush()
          self._finish_frag_download(ctx, info_dict)
          return True
diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py

index 9cbd5a584f3e12848fba074c9d52220e2d0258fd..9cfc191cbb7d8e7d1cd6cfc2dadc6417c9bcdf3f 100644 (file)
--- a/yt_dlp/downloader/hls.py
+++ b/yt_dlp/downloader/hls.py
@@ -260,29 +260,35 @@ def pack_fragment(frag_content, frag_index):
                          block.end += adjust
  
                          dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
-                        cue = block.as_json
  
-                        # skip the cue if an identical one appears
-                        # in the window of potential duplicates
-                        # and prune the window of unviable candidates
+                        ready = []
+
                          i = 0
-                        skip = True
+                        is_new = True
                          while i < len(dedup_window):
-                            window_cue = dedup_window[i]
-                            if window_cue == cue:
-                                break
-                            if window_cue['end'] >= cue['start']:
-                                i += 1
+                            wcue = dedup_window[i]
+                            wblock = webvtt.CueBlock.from_json(wcue)
+                            i += 1
+                            if wblock.hinges(block):
+                                wcue['end'] = block.end
+                                is_new = False
+                                continue
+                            if wblock == block:
+                                is_new = False
+                                continue
+                            if wblock.end > block.start:
                                  continue
+                            ready.append(wblock)
+                            i -= 1
                              del dedup_window[i]
-                        else:
-                            skip = False
  
-                        if skip:
-                            continue
+                        if is_new:
+                            dedup_window.append(block.as_json)
+                        for block in ready:
+                            block.write_into(output)
  
-                        # add the cue to the window
-                        dedup_window.append(cue)
+                        # we only emit cues once they fall out of the duplicate window
+                        continue
                      elif isinstance(block, webvtt.Magic):
                          # take care of MPEG PES timestamp overflow
                          if block.mpegts is None:
@@ -317,6 +323,19 @@ def pack_fragment(frag_content, frag_index):
                      block.write_into(output)
  
                  return output.getvalue().encode('utf-8')
+
+            def fin_fragments():
+                """
+                Serialize the cues still held in the WebVTT dedup window.
+
+                pack_fragment only writes a cue once it has dropped out of
+                the window, so whatever is left when the download ends has
+                not been written yet. Returns those cues encoded as UTF-8
+                bytes, or b'' when the window is missing or empty.
+                """
+                dedup_window = extra_state.get('webvtt_dedup_window')
+                if not dedup_window:
+                    return b''
+
+                output = io.StringIO()
+                for cue in dedup_window:
+                    # window entries are kept in their as_json dict form
+                    webvtt.CueBlock.from_json(cue).write_into(output)
+
+                return output.getvalue().encode('utf-8')
+
+            # propagate the success flag to the caller, as the else branch below does
+            return self.download_and_append_fragments(
+                ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
          else:
-            pack_fragment = None
-        return self.download_and_append_fragments(ctx, fragments, info_dict, pack_fragment)
+            return self.download_and_append_fragments(ctx, fragments, info_dict)
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py

index ef55e6459ad2ae0ed9045495b3bdbf9f038fef33..eee2a4a2dd28ea16249ba11a5039a3ca2c3600f2 100644 (file)
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -331,6 +331,26 @@ def as_json(self):
              'settings': self.settings,
          }
  
+    def __eq__(self, other):
+        """
+        Compare two cues by their as_json representation.
+
+        Returns NotImplemented when other is not a cue block, so comparing
+        against e.g. None evaluates to False instead of raising
+        AttributeError on the missing as_json attribute.
+        """
+        if not isinstance(other, type(self)):
+            return NotImplemented
+        return self.as_json == other.as_json
+
+    @classmethod
+    def from_json(cls, json):
+        """
+        Build an instance from a mapping with the keys 'id', 'start', 'end',
+        'text' and 'settings'.
+
+        Every key is read with plain indexing, so all five must be present.
+        """
+        return cls(
+            id=json['id'],
+            start=json['start'],
+            end=json['end'],
+            text=json['text'],
+            settings=json['settings']
+        )
+
+    def hinges(self, other):
+        """
+        Tell whether other directly continues this cue.
+
+        That is the case when both carry the same text and settings and
+        other starts exactly where this cue ends. The ids are not compared.
+        """
+        if self.text != other.text:
+            return False
+        if self.settings != other.settings:
+            return False
+        # chained comparison: neither cue may have a negative duration, and
+        # this cue's end must coincide with the other cue's start
+        return self.start <= self.end == other.start <= other.end
+
  
  def parse_fragment(frag_content):
      """
author	Felix S <redacted>
	Mon, 9 Aug 2021 20:22:30 +0000 (20:22 +0000)
committer	GitHub <redacted>
	Mon, 9 Aug 2021 20:22:30 +0000 (01:52 +0530)
yt_dlp/downloader/fragment.py		patch \| blob \| blame \| history
yt_dlp/downloader/hls.py		patch \| blob \| blame \| history
yt_dlp/webvtt.py		patch \| blob \| blame \| history