]> jfr.im git - yt-dlp.git/blame - yt_dlp/postprocessor/sponsorblock.py
[EmbedThumbnail] Prefer AtomicParsley over ffmpeg if available
[yt-dlp.git] / yt_dlp / postprocessor / sponsorblock.py
CommitLineData
ef58c476 1from hashlib import sha256
2import itertools
7a340e0d
NA
3import json
4import re
ef58c476 5import time
7a340e0d
NA
6
7from .ffmpeg import FFmpegPostProcessor
8from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
e6f21b3d 9from ..utils import PostProcessingError, network_exceptions, sanitized_Request
7a340e0d
NA
10
11
class SponsorBlockPP(FFmpegPostProcessor):
    """Post-processor that looks up SponsorBlock segments for a video.

    The segments returned by the SponsorBlock API are converted to
    chapter-like dicts and stored in ``info['sponsorblock_chapters']``.
    This class only fetches and annotates; it does not touch any file.
    """

    # https://wiki.sponsor.ajay.app/w/Types
    # Maps yt-dlp extractor keys to the service name sent to the API.
    EXTRACTORS = {
        'Youtube': 'YouTube',
    }
    # Point-of-interest categories; these are widened by one second in
    # _get_sponsor_chapters so that they can be marked as a chapter.
    POI_CATEGORIES = {
        'poi_highlight': 'Highlight',
    }
    # Maps every known category id to the title used for its chapter.
    CATEGORIES = {
        'sponsor': 'Sponsor',
        'intro': 'Intermission/Intro Animation',
        'outro': 'Endcards/Credits',
        'selfpromo': 'Unpaid/Self Promotion',
        'preview': 'Preview/Recap',
        'filler': 'Filler Tangent',
        'interaction': 'Interaction Reminder',
        'music_offtopic': 'Non-Music Section',
        **POI_CATEGORIES,
    }

    def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
        """Set up the post-processor.

        @param downloader  Passed through to FFmpegPostProcessor.
        @param categories  Iterable of category ids to request;
                           a falsy value selects every key of CATEGORIES.
        @param api         Base URL of the SponsorBlock API. ``https://`` is
                           prepended when no http(s) scheme is given.
        """
        FFmpegPostProcessor.__init__(self, downloader)
        self._categories = tuple(categories or self.CATEGORIES.keys())
        self._API_URL = api if re.match('^https?://', api) else 'https://' + api

    def run(self, info):
        """Fetch the segments for ``info`` and store them as chapters.

        Returns ``([], info)``. For extractors not listed in EXTRACTORS the
        info dict is returned unchanged after printing a notice.
        """
        extractor = info['extractor_key']
        if extractor not in self.EXTRACTORS:
            self.to_screen(f'SponsorBlock is not supported for {extractor}')
            return [], info

        self.to_screen('Fetching SponsorBlock segments')
        # The duration may be missing from the info dict; the chapter builder
        # already copes with a falsy duration, so do not fail on its absence.
        info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info.get('duration'))
        return [], info

    def _get_sponsor_chapters(self, info, duration):
        """Return the SponsorBlock segments of the video as a list of chapter dicts.

        @param info      Info dict; ``id`` and ``extractor_key`` are read.
        @param duration  Duration of the video in the same unit as the segment
                         times, or a falsy value when it is unknown.

        Segments whose recorded video duration differs from ``duration`` by
        more than one second are dropped and a warning is reported.
        """
        segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']])

        def duration_filter(s):
            # NOTE: besides deciding whether to keep the segment, this
            # normalizes s['segment'] in place.
            start_end = s['segment']
            # Ignore milliseconds difference at the start.
            if start_end[0] <= 1:
                start_end[0] = 0
            # Make POI chapters 1 sec so that we can properly mark them
            if s['category'] in self.POI_CATEGORIES:
                start_end[1] += 1
            # Ignore milliseconds difference at the end.
            # Never allow the segment to exceed the video.
            if duration and duration - start_end[1] <= 1:
                start_end[1] = duration
            # SponsorBlock duration may be absent or it may deviate from the real one.
            # A missing/None value is treated the same as 0 (unknown).
            video_duration = s.get('videoDuration')
            return not video_duration or not duration or abs(duration - video_duration) <= 1

        duration_match = [s for s in segments if duration_filter(s)]
        if len(duration_match) != len(segments):
            self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video')

        def to_chapter(s):
            (start, end), cat = s['segment'], s['category']
            return {
                'start_time': start,
                'end_time': end,
                'category': cat,
                'title': self.CATEGORIES[cat],
                '_categories': [(cat, start, end)]
            }

        sponsor_chapters = [to_chapter(s) for s in duration_match]
        if not sponsor_chapters:
            self.to_screen('No segments were found in the SponsorBlock database')
        else:
            self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database')
        return sponsor_chapters

    def _get_sponsor_segments(self, video_id, service):
        """Query the API and return the raw segment list for ``video_id``.

        Only a prefix of the SHA-256 hash of the id is sent, so the response
        is searched for the entry whose ``videoID`` equals ``video_id``.
        Returns an empty list when there is no such entry.
        """
        video_hash = sha256(video_id.encode('ascii')).hexdigest()
        # SponsorBlock API recommends using first 4 hash characters.
        url = f'{self._API_URL}/api/skipSegments/{video_hash[:4]}?' + compat_urllib_parse_urlencode({
            'service': service,
            'categories': json.dumps(self._categories),
        })
        self.write_debug(f'SponsorBlock query: {url}')
        for d in self._get_json(url):
            if d['videoID'] == video_id:
                return d['segments']
        return []

    def _get_json(self, url):
        """Download ``url`` and return the decoded JSON document.

        An HTTP 404 response yields an empty list. Other network errors are
        retried up to ``extractor_retries`` times, after which
        PostProcessingError is raised.
        """
        # While this is not an extractor, it behaves similar to one and
        # so obey extractor_retries and sleep_interval_requests
        max_retries = self.get_param('extractor_retries', 3)
        sleep_interval = self.get_param('sleep_interval_requests') or 0
        for retries in itertools.count():
            try:
                rsp = self._downloader.urlopen(sanitized_Request(url))
                return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
            except network_exceptions as e:
                if isinstance(e, compat_HTTPError) and e.code == 404:
                    return []
                if retries < max_retries:
                    self.report_warning(f'{e}. Retrying...')
                    if sleep_interval > 0:
                        self.to_screen(f'Sleeping {sleep_interval} seconds ...')
                        time.sleep(sleep_interval)
                    continue
                raise PostProcessingError(f'Unable to communicate with SponsorBlock API: {e}') from e