jfr.im git - yt-dlp.git/commitdiff
[webvtt] Merge daisy-chained duplicate cues (#638)
author: Felix S <redacted>
Mon, 9 Aug 2021 20:22:30 +0000 (20:22 +0000)
committer: GitHub <redacted>
Mon, 9 Aug 2021 20:22:30 +0000 (01:52 +0530)
Fixes: https://github.com/yt-dlp/yt-dlp/issues/631#issuecomment-893338552
Previous deduplication algorithm only removed duplicate cues with
identical text, styles and timestamps.  This change also merges
cues that come in ‘daisy chains’, where sequences of cues with
identical text and styles appear in which the ending timestamp of
one equals the starting timestamp of the next.

This deduplication algorithm has the somewhat unfortunate side effect
that NOTE blocks between cues, if found, will be emitted in a different
order relative to their original cues.  This may be unwanted if perfect
fidelity is desired, but then so is daisy-chain deduplication itself.
NOTE blocks ought to be ignored by WebVTT players in any case.

Authored by: fstirlitz

yt_dlp/downloader/fragment.py
yt_dlp/downloader/hls.py
yt_dlp/webvtt.py

diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py
index 1cc99a4e92a8bb3d7426b7cd0755174f514102a2..e3af140fde3a85ffc1c461d60b852afe45f5a7fc 100644 (file)
@@ -329,7 +329,7 @@ def _prepare_external_frag_download(self, ctx):
             'fragment_index': 0,
         })
 
-    def download_and_append_fragments(self, ctx, fragments, info_dict, pack_func=None):
+    def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None):
         fragment_retries = self.params.get('fragment_retries', 0)
         is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)
         if not pack_func:
@@ -424,5 +424,8 @@ def _download_fragment(fragment):
                 if not result:
                     return False
 
+        if finish_func is not None:
+            ctx['dest_stream'].write(finish_func())
+            ctx['dest_stream'].flush()
         self._finish_frag_download(ctx, info_dict)
         return True
diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py
index 9cbd5a584f3e12848fba074c9d52220e2d0258fd..9cfc191cbb7d8e7d1cd6cfc2dadc6417c9bcdf3f 100644 (file)
@@ -260,29 +260,35 @@ def pack_fragment(frag_content, frag_index):
                         block.end += adjust
 
                         dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
-                        cue = block.as_json
 
-                        # skip the cue if an identical one appears
-                        # in the window of potential duplicates
-                        # and prune the window of unviable candidates
+                        ready = []
+
                         i = 0
-                        skip = True
+                        is_new = True
                         while i < len(dedup_window):
-                            window_cue = dedup_window[i]
-                            if window_cue == cue:
-                                break
-                            if window_cue['end'] >= cue['start']:
-                                i += 1
+                            wcue = dedup_window[i]
+                            wblock = webvtt.CueBlock.from_json(wcue)
+                            i += 1
+                            if wblock.hinges(block):
+                                wcue['end'] = block.end
+                                is_new = False
+                                continue
+                            if wblock == block:
+                                is_new = False
+                                continue
+                            if wblock.end > block.start:
                                 continue
+                            ready.append(wblock)
+                            i -= 1
                             del dedup_window[i]
-                        else:
-                            skip = False
 
-                        if skip:
-                            continue
+                        if is_new:
+                            dedup_window.append(block.as_json)
+                        for block in ready:
+                            block.write_into(output)
 
-                        # add the cue to the window
-                        dedup_window.append(cue)
+                        # we only emit cues once they fall out of the duplicate window
+                        continue
                     elif isinstance(block, webvtt.Magic):
                         # take care of MPEG PES timestamp overflow
                         if block.mpegts is None:
@@ -317,6 +323,19 @@ def pack_fragment(frag_content, frag_index):
                     block.write_into(output)
 
                 return output.getvalue().encode('utf-8')
+
+            def fin_fragments():
+                dedup_window = extra_state.get('webvtt_dedup_window')
+                if not dedup_window:
+                    return b''
+
+                output = io.StringIO()
+                for cue in dedup_window:
+                    webvtt.CueBlock.from_json(cue).write_into(output)
+
+                return output.getvalue().encode('utf-8')
+
+            self.download_and_append_fragments(
+                ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
         else:
-            pack_fragment = None
-        return self.download_and_append_fragments(ctx, fragments, info_dict, pack_fragment)
+            return self.download_and_append_fragments(ctx, fragments, info_dict)
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py
index ef55e6459ad2ae0ed9045495b3bdbf9f038fef33..eee2a4a2dd28ea16249ba11a5039a3ca2c3600f2 100644 (file)
@@ -331,6 +331,26 @@ def as_json(self):
             'settings': self.settings,
         }
 
+    def __eq__(self, other):
+        return self.as_json == other.as_json
+
+    @classmethod
+    def from_json(cls, json):
+        return cls(
+            id=json['id'],
+            start=json['start'],
+            end=json['end'],
+            text=json['text'],
+            settings=json['settings']
+        )
+
+    def hinges(self, other):
+        if self.text != other.text:
+            return False
+        if self.settings != other.settings:
+            return False
+        return self.start <= self.end == other.start <= other.end
+
 
 def parse_fragment(frag_content):
     """