yt_dlp/postprocessor/sponsorblock.py

   1 import hashlib
   2 import json
   3 import re
   4 import urllib.parse
   5
   6 from .ffmpeg import FFmpegPostProcessor
   7
   8
   9 class SponsorBlockPP(FFmpegPostProcessor):
  10     # https://wiki.sponsor.ajay.app/w/Types
  11     EXTRACTORS = {
  12         'Youtube': 'YouTube',
  13     }
  14     POI_CATEGORIES = {
  15         'poi_highlight': 'Highlight',
  16     }
  17     NON_SKIPPABLE_CATEGORIES = {
  18         **POI_CATEGORIES,
  19         'chapter': 'Chapter',
  20     }
  21     CATEGORIES = {
  22         'sponsor': 'Sponsor',
  23         'intro': 'Intermission/Intro Animation',
  24         'outro': 'Endcards/Credits',
  25         'selfpromo': 'Unpaid/Self Promotion',
  26         'preview': 'Preview/Recap',
  27         'filler': 'Filler Tangent',
  28         'interaction': 'Interaction Reminder',
  29         'music_offtopic': 'Non-Music Section',
  30         **NON_SKIPPABLE_CATEGORIES
  31     }
  32
  33     def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
  34         FFmpegPostProcessor.__init__(self, downloader)
  35         self._categories = tuple(categories or self.CATEGORIES.keys())
  36         self._API_URL = api if re.match('^https?://', api) else 'https://' + api
  37
  38     def run(self, info):
  39         extractor = info['extractor_key']
  40         if extractor not in self.EXTRACTORS:
  41             self.to_screen(f'SponsorBlock is not supported for {extractor}')
  42             return [], info
  43
  44         self.to_screen('Fetching SponsorBlock segments')
  45         info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info.get('duration'))
  46         return [], info
  47
  48     def _get_sponsor_chapters(self, info, duration):
  49         segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']])
  50
  51         def duration_filter(s):
  52             start_end = s['segment']
  53             # Ignore entire video segments (https://wiki.sponsor.ajay.app/w/Types).
  54             if start_end == (0, 0):
  55                 return False
  56             # Ignore milliseconds difference at the start.
  57             if start_end[0] <= 1:
  58                 start_end[0] = 0
  59             # Make POI chapters 1 sec so that we can properly mark them
  60             if s['category'] in self.POI_CATEGORIES.keys():
  61                 start_end[1] += 1
  62             # Ignore milliseconds difference at the end.
  63             # Never allow the segment to exceed the video.
  64             if duration and duration - start_end[1] <= 1:
  65                 start_end[1] = duration
  66             # SponsorBlock duration may be absent or it may deviate from the real one.
  67             return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1
  68
  69         duration_match = [s for s in segments if duration_filter(s)]
  70         if len(duration_match) != len(segments):
  71             self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video')
  72
  73         def to_chapter(s):
  74             (start, end), cat = s['segment'], s['category']
  75             title = s['description'] if cat == 'chapter' else self.CATEGORIES[cat]
  76             return {
  77                 'start_time': start,
  78                 'end_time': end,
  79                 'category': cat,
  80                 'title': title,
  81                 '_categories': [(cat, start, end, title)],
  82             }
  83
  84         sponsor_chapters = [to_chapter(s) for s in duration_match]
  85         if not sponsor_chapters:
  86             self.to_screen('No segments were found in the SponsorBlock database')
  87         else:
  88             self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database')
  89         return sponsor_chapters
  90
  91     def _get_sponsor_segments(self, video_id, service):
  92         hash = hashlib.sha256(video_id.encode('ascii')).hexdigest()
  93         # SponsorBlock API recommends using first 4 hash characters.
  94         url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + urllib.parse.urlencode({
  95             'service': service,
  96             'categories': json.dumps(self._categories),
  97             'actionTypes': json.dumps(['skip', 'poi', 'chapter'])
  98         })
  99         for d in self._download_json(url) or []:
 100             if d['videoID'] == video_id:
 101                 return d['segments']
 102         return []