yt_dlp/postprocessor/sponsorblock.py

   1 from hashlib import sha256
   2 import itertools
   3 import json
   4 import re
   5 import time
   6
   7 from .ffmpeg import FFmpegPostProcessor
   8 from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
   9 from ..utils import PostProcessingError, network_exceptions, sanitized_Request
  10
  11
  12 class SponsorBlockPP(FFmpegPostProcessor):
  13     # https://wiki.sponsor.ajay.app/w/Types
  14     EXTRACTORS = {
  15         'Youtube': 'YouTube',
  16     }
  17     POI_CATEGORIES = {
  18         'poi_highlight': 'Highlight',
  19     }
  20     CATEGORIES = {
  21         'sponsor': 'Sponsor',
  22         'intro': 'Intermission/Intro Animation',
  23         'outro': 'Endcards/Credits',
  24         'selfpromo': 'Unpaid/Self Promotion',
  25         'preview': 'Preview/Recap',
  26         'filler': 'Filler Tangent',
  27         'interaction': 'Interaction Reminder',
  28         'music_offtopic': 'Non-Music Section',
  29         **POI_CATEGORIES,
  30     }
  31
  32     def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
  33         FFmpegPostProcessor.__init__(self, downloader)
  34         self._categories = tuple(categories or self.CATEGORIES.keys())
  35         self._API_URL = api if re.match('^https?://', api) else 'https://' + api
  36
  37     def run(self, info):
  38         extractor = info['extractor_key']
  39         if extractor not in self.EXTRACTORS:
  40             self.to_screen(f'SponsorBlock is not supported for {extractor}')
  41             return [], info
  42
  43         self.to_screen('Fetching SponsorBlock segments')
  44         info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration'])
  45         return [], info
  46
  47     def _get_sponsor_chapters(self, info, duration):
  48         segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']])
  49
  50         def duration_filter(s):
  51             start_end = s['segment']
  52             # Ignore entire video segments (https://wiki.sponsor.ajay.app/w/Types).
  53             if start_end == (0, 0):
  54                 return False
  55             # Ignore milliseconds difference at the start.
  56             if start_end[0] <= 1:
  57                 start_end[0] = 0
  58             # Make POI chapters 1 sec so that we can properly mark them
  59             if s['category'] in self.POI_CATEGORIES.keys():
  60                 start_end[1] += 1
  61             # Ignore milliseconds difference at the end.
  62             # Never allow the segment to exceed the video.
  63             if duration and duration - start_end[1] <= 1:
  64                 start_end[1] = duration
  65             # SponsorBlock duration may be absent or it may deviate from the real one.
  66             return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1
  67
  68         duration_match = [s for s in segments if duration_filter(s)]
  69         if len(duration_match) != len(segments):
  70             self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video')
  71
  72         def to_chapter(s):
  73             (start, end), cat = s['segment'], s['category']
  74             return {
  75                 'start_time': start,
  76                 'end_time': end,
  77                 'category': cat,
  78                 'title': self.CATEGORIES[cat],
  79                 '_categories': [(cat, start, end)]
  80             }
  81
  82         sponsor_chapters = [to_chapter(s) for s in duration_match]
  83         if not sponsor_chapters:
  84             self.to_screen('No segments were found in the SponsorBlock database')
  85         else:
  86             self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database')
  87         return sponsor_chapters
  88
  89     def _get_sponsor_segments(self, video_id, service):
  90         hash = sha256(video_id.encode('ascii')).hexdigest()
  91         # SponsorBlock API recommends using first 4 hash characters.
  92         url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({
  93             'service': service,
  94             'categories': json.dumps(self._categories),
  95             'actionTypes': json.dumps(['skip', 'poi'])
  96         })
  97         self.write_debug(f'SponsorBlock query: {url}')
  98         for d in self._get_json(url):
  99             if d['videoID'] == video_id:
 100                 return d['segments']
 101         return []
 102
 103     def _get_json(self, url):
 104         # While this is not an extractor, it behaves similar to one and
 105         # so obey extractor_retries and sleep_interval_requests
 106         max_retries = self.get_param('extractor_retries', 3)
 107         sleep_interval = self.get_param('sleep_interval_requests') or 0
 108         for retries in itertools.count():
 109             try:
 110                 rsp = self._downloader.urlopen(sanitized_Request(url))
 111                 return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
 112             except network_exceptions as e:
 113                 if isinstance(e, compat_HTTPError) and e.code == 404:
 114                     return []
 115                 if retries < max_retries:
 116                     self.report_warning(f'{e}. Retrying...')
 117                     if sleep_interval > 0:
 118                         self.to_screen(f'Sleeping {sleep_interval} seconds ...')
 119                         time.sleep(sleep_interval)
 120                     continue
 121                 raise PostProcessingError(f'Unable to communicate with SponsorBlock API: {e}')