[ie/cloudflarestream] Fix `_VALID_URL` and embed extraction (#10215)

[yt-dlp.git] / yt_dlp / postprocessor / sponsorblock.py
diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py

index 6264d45c5d1944c45901a4aa7abd719e20b2f60a..6cf9ab62eab5d43a0eb59edbfaf7314a716adc82 100644 (file)
--- a/yt_dlp/postprocessor/sponsorblock.py
+++ b/yt_dlp/postprocessor/sponsorblock.py
@@ -1,25 +1,33 @@
+import hashlib
  import json
  import re
-from hashlib import sha256
+import urllib.parse
  
  from .ffmpeg import FFmpegPostProcessor
-from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
-from ..utils import PostProcessingError, sanitized_Request
  
  
  class SponsorBlockPP(FFmpegPostProcessor):
-
+    # https://wiki.sponsor.ajay.app/w/Types
      EXTRACTORS = {
          'Youtube': 'YouTube',
      }
+    POI_CATEGORIES = {
+        'poi_highlight': 'Highlight',
+    }
+    NON_SKIPPABLE_CATEGORIES = {
+        **POI_CATEGORIES,
+        'chapter': 'Chapter',
+    }
      CATEGORIES = {
          'sponsor': 'Sponsor',
          'intro': 'Intermission/Intro Animation',
          'outro': 'Endcards/Credits',
          'selfpromo': 'Unpaid/Self Promotion',
-        'interaction': 'Interaction Reminder',
          'preview': 'Preview/Recap',
-        'music_offtopic': 'Non-Music Section'
+        'filler': 'Filler Tangent',
+        'interaction': 'Interaction Reminder',
+        'music_offtopic': 'Non-Music Section',
+        **NON_SKIPPABLE_CATEGORIES,
      }
  
      def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
@@ -33,7 +41,8 @@ def run(self, info):
              self.to_screen(f'SponsorBlock is not supported for {extractor}')
              return [], info
  
-        info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration'])
+        self.to_screen('Fetching SponsorBlock segments')
+        info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info.get('duration'))
          return [], info
  
      def _get_sponsor_chapters(self, info, duration):
@@ -41,15 +50,22 @@ def _get_sponsor_chapters(self, info, duration):
  
          def duration_filter(s):
              start_end = s['segment']
+            # Ignore entire video segments (https://wiki.sponsor.ajay.app/w/Types).
+            if start_end == (0, 0):
+                return False
              # Ignore milliseconds difference at the start.
              if start_end[0] <= 1:
                  start_end[0] = 0
+            # Make POI chapters 1 sec so that we can properly mark them
+            if s['category'] in self.POI_CATEGORIES:
+                start_end[1] += 1
              # Ignore milliseconds difference at the end.
              # Never allow the segment to exceed the video.
              if duration and duration - start_end[1] <= 1:
                  start_end[1] = duration
              # SponsorBlock duration may be absent or it may deviate from the real one.
-            return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1
+            diff = abs(duration - s['videoDuration']) if s['videoDuration'] else 0
+            return diff < 1 or (diff < 5 and diff / (start_end[1] - start_end[0]) < 0.05)
  
          duration_match = [s for s in segments if duration_filter(s)]
          if len(duration_match) != len(segments):
@@ -57,40 +73,32 @@ def duration_filter(s):
  
          def to_chapter(s):
              (start, end), cat = s['segment'], s['category']
+            title = s['description'] if cat == 'chapter' else self.CATEGORIES[cat]
              return {
                  'start_time': start,
                  'end_time': end,
                  'category': cat,
-                'title': self.CATEGORIES[cat],
-                '_categories': [(cat, start, end)]
+                'title': title,
+                'type': s['actionType'],
+                '_categories': [(cat, start, end, title)],
              }
  
          sponsor_chapters = [to_chapter(s) for s in duration_match]
          if not sponsor_chapters:
-            self.to_screen('No segments were found in the SponsorBlock database')
+            self.to_screen('No matching segments were found in the SponsorBlock database')
          else:
              self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database')
          return sponsor_chapters
  
      def _get_sponsor_segments(self, video_id, service):
-        hash = sha256(video_id.encode('ascii')).hexdigest()
+        video_hash = hashlib.sha256(video_id.encode('ascii')).hexdigest()
          # SponsorBlock API recommends using first 4 hash characters.
-        url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({
+        url = f'{self._API_URL}/api/skipSegments/{video_hash[:4]}?' + urllib.parse.urlencode({
              'service': service,
              'categories': json.dumps(self._categories),
+            'actionTypes': json.dumps(['skip', 'poi', 'chapter']),
          })
-        for d in self._get_json(url):
+        for d in self._download_json(url) or []:
              if d['videoID'] == video_id:
                  return d['segments']
          return []
-
-    def _get_json(self, url):
-        self.write_debug(f'SponsorBlock query: {url}')
-        try:
-            rsp = self._downloader.urlopen(sanitized_Request(url))
-        except compat_HTTPError as e:
-            if e.code == 404:
-                return []
-            raise PostProcessingError(f'Error communicating with SponsorBlock API - {e}')
-
-        return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))