[yt-dlp.git] / yt_dlp / postprocessor / sponsorblock.py

from hashlib import sha256
import itertools
import json
import re
import time

from .ffmpeg import FFmpegPostProcessor
from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
from ..utils import PostProcessingError, network_exceptions, sanitized_Request


class SponsorBlockPP(FFmpegPostProcessor):
    """Post-processor that looks up SponsorBlock segments for a video.

    The segments returned by the SponsorBlock API are converted into chapter
    dicts and stored in ``info['sponsorblock_chapters']``.
    """

    # https://wiki.sponsor.ajay.app/w/Types
    # Extractor key (as found in info['extractor_key']) -> SponsorBlock service name
    EXTRACTORS = {
        'Youtube': 'YouTube',
    }
    # Point-of-interest categories; their segments are widened by one second
    # in _get_sponsor_chapters so that they can be marked as chapters
    POI_CATEGORIES = {
        'poi_highlight': 'Highlight',
    }
    # Category key (as used by the API) -> chapter title
    CATEGORIES = {
        'sponsor': 'Sponsor',
        'intro': 'Intermission/Intro Animation',
        'outro': 'Endcards/Credits',
        'selfpromo': 'Unpaid/Self Promotion',
        'preview': 'Preview/Recap',
        'filler': 'Filler Tangent',
        'interaction': 'Interaction Reminder',
        'music_offtopic': 'Non-Music Section',
        **POI_CATEGORIES,
    }

    def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
        """
        @param downloader   Passed through to the base post-processor
        @param categories   Iterable of category keys to query. All keys of
                            CATEGORIES are used when this is empty or None
        @param api          Base URL of the SponsorBlock API. 'https://' is
                            prepended when no http(s) scheme is given
        """
        super().__init__(downloader)
        self._categories = tuple(categories or self.CATEGORIES)
        self._API_URL = api if re.match(r'^https?://', api) else 'https://' + api

    def run(self, info):
        """Fetch the SponsorBlock chapters and store them in `info`.

        Returns the tuple ([], info). `info` is returned unmodified when the
        extractor is not listed in EXTRACTORS.
        """
        extractor = info['extractor_key']
        if extractor not in self.EXTRACTORS:
            self.to_screen(f'SponsorBlock is not supported for {extractor}')
            return [], info

        self.to_screen('Fetching SponsorBlock segments')
        # The duration is not guaranteed to be known. _get_sponsor_chapters
        # already copes with a falsy duration, so do not raise KeyError here
        info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info.get('duration'))
        return [], info

    def _get_sponsor_chapters(self, info, duration):
        """Return the SponsorBlock segments of the video as a list of chapter dicts.

        @param info      Info dict; 'id' and 'extractor_key' are read from it
        @param duration  Duration of the video in seconds, or a falsy value
                         when unknown (end-clamping and the duration check
                         are skipped in that case)
        """
        segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']])

        def duration_filter(s):
            # NB: Besides filtering, this normalizes s['segment'] in-place
            start_end = s['segment']
            # Ignore milliseconds difference at the start.
            if start_end[0] <= 1:
                start_end[0] = 0
            # Make POI chapters 1 sec so that we can properly mark them
            if s['category'] in self.POI_CATEGORIES:
                start_end[1] += 1
            # Ignore milliseconds difference at the end.
            # Never allow the segment to exceed the video.
            if duration and duration - start_end[1] <= 1:
                start_end[1] = duration
            # SponsorBlock duration may be absent or it may deviate from the real one.
            # A missing/zero value means that there is nothing to compare against
            video_duration = s.get('videoDuration')
            return not video_duration or not duration or abs(duration - video_duration) <= 1

        duration_match = [s for s in segments if duration_filter(s)]
        if len(duration_match) != len(segments):
            self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video')

        def to_chapter(s):
            (start, end), cat = s['segment'], s['category']
            return {
                'start_time': start,
                'end_time': end,
                'category': cat,
                'title': self.CATEGORIES[cat],
                '_categories': [(cat, start, end)]
            }

        sponsor_chapters = [to_chapter(s) for s in duration_match]
        if not sponsor_chapters:
            self.to_screen('No segments were found in the SponsorBlock database')
        else:
            self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database')
        return sponsor_chapters

    def _get_sponsor_segments(self, video_id, service):
        """Query the API and return the raw segment list for `video_id`.

        The API is queried with a prefix of the SHA-256 hash of the video ID,
        so the response may contain several videos; only the entry whose
        'videoID' equals `video_id` is used. Returns [] when there is none.
        """
        video_hash = sha256(video_id.encode('ascii')).hexdigest()
        # SponsorBlock API recommends using first 4 hash characters.
        url = f'{self._API_URL}/api/skipSegments/{video_hash[:4]}?' + compat_urllib_parse_urlencode({
            'service': service,
            'categories': json.dumps(self._categories),
        })
        self.write_debug(f'SponsorBlock query: {url}')
        for d in self._get_json(url):
            if d['videoID'] == video_id:
                return d['segments']
        return []

    def _get_json(self, url):
        """Download `url` and return the decoded JSON document.

        An HTTP 404 response yields []. Other network errors are retried up
        to `extractor_retries` times, after which PostProcessingError is
        raised with the last error as its cause.
        """
        # While this is not an extractor, it behaves similar to one and
        # so obey extractor_retries and sleep_interval_requests
        max_retries = self.get_param('extractor_retries', 3)
        sleep_interval = self.get_param('sleep_interval_requests') or 0
        for retries in itertools.count():
            try:
                rsp = self._downloader.urlopen(sanitized_Request(url))
                return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
            except network_exceptions as e:
                if isinstance(e, compat_HTTPError) and e.code == 404:
                    return []
                if retries < max_retries:
                    self.report_warning(f'{e}. Retrying...')
                    if sleep_interval > 0:
                        self.to_screen(f'Sleeping {sleep_interval} seconds ...')
                        time.sleep(sleep_interval)
                    continue
                raise PostProcessingError(f'Unable to communicate with SponsorBlock API: {e}') from e
Commit	Line	Data
ef58c476	1	from hashlib import sha256
ef58c476	2	import itertools
7a340e0d NA	3	import json
7a340e0d NA	4	import re
ef58c476	5	import time
7a340e0d NA	6
	7	from .ffmpeg import FFmpegPostProcessor
	8	from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
e6f21b3d	9	from ..utils import PostProcessingError, network_exceptions, sanitized_Request
7a340e0d NA	10
	11
	12	class SponsorBlockPP(FFmpegPostProcessor):
8157a09d	13	# https://wiki.sponsor.ajay.app/w/Types
7a340e0d NA	14	EXTRACTORS = {
	15	'Youtube': 'YouTube',
	16	}
8157a09d NA	17	POI_CATEGORIES = {
	18	'poi_highlight': 'Highlight',
	19	}
7a340e0d NA	20	CATEGORIES = {
	21	'sponsor': 'Sponsor',
	22	'intro': 'Intermission/Intro Animation',
	23	'outro': 'Endcards/Credits',
	24	'selfpromo': 'Unpaid/Self Promotion',
7a340e0d	25	'preview': 'Preview/Recap',
8157a09d NA	26	'filler': 'Filler Tangent',
	27	'interaction': 'Interaction Reminder',
	28	'music_offtopic': 'Non-Music Section',
	29	**POI_CATEGORIES,
7a340e0d NA	30	}
	31
	32	def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
	33	FFmpegPostProcessor.__init__(self, downloader)
	34	self._categories = tuple(categories or self.CATEGORIES.keys())
	35	self._API_URL = api if re.match('^https?://', api) else 'https://' + api
	36
	37	def run(self, info):
	38	extractor = info['extractor_key']
	39	if extractor not in self.EXTRACTORS:
	40	self.to_screen(f'SponsorBlock is not supported for {extractor}')
	41	return [], info
	42
ef58c476	43	self.to_screen('Fetching SponsorBlock segments')
7a340e0d NA	44	info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration'])
	45	return [], info
	46
	47	def _get_sponsor_chapters(self, info, duration):
	48	segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']])
	49
	50	def duration_filter(s):
	51	start_end = s['segment']
	52	# Ignore milliseconds difference at the start.
	53	if start_end[0] <= 1:
	54	start_end[0] = 0
8157a09d NA	55	# Make POI chapters 1 sec so that we can properly mark them
	56	if s['category'] in self.POI_CATEGORIES.keys():
	57	start_end[1] += 1
7a340e0d NA	58	# Ignore milliseconds difference at the end.
	59	# Never allow the segment to exceed the video.
	60	if duration and duration - start_end[1] <= 1:
	61	start_end[1] = duration
	62	# SponsorBlock duration may be absent or it may deviate from the real one.
	63	return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1
	64
	65	duration_match = [s for s in segments if duration_filter(s)]
	66	if len(duration_match) != len(segments):
	67	self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video')
	68
	69	def to_chapter(s):
	70	(start, end), cat = s['segment'], s['category']
	71	return {
	72	'start_time': start,
	73	'end_time': end,
	74	'category': cat,
	75	'title': self.CATEGORIES[cat],
	76	'_categories': [(cat, start, end)]
	77	}
	78
	79	sponsor_chapters = [to_chapter(s) for s in duration_match]
	80	if not sponsor_chapters:
	81	self.to_screen('No segments were found in the SponsorBlock database')
	82	else:
	83	self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database')
	84	return sponsor_chapters
	85
	86	def _get_sponsor_segments(self, video_id, service):
	87	hash = sha256(video_id.encode('ascii')).hexdigest()
	88	# SponsorBlock API recommends using first 4 hash characters.
	89	url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({
	90	'service': service,
	91	'categories': json.dumps(self._categories),
	92	})
ef58c476	93	self.write_debug(f'SponsorBlock query: {url}')
7a340e0d NA	94	for d in self._get_json(url):
	95	if d['videoID'] == video_id:
	96	return d['segments']
	97	return []
	98
	99	def _get_json(self, url):
ef58c476	100	# While this is not an extractor, it behaves similar to one and
	101	# so obey extractor_retries and sleep_interval_requests
	102	max_retries = self.get_param('extractor_retries', 3)
	103	sleep_interval = self.get_param('sleep_interval_requests') or 0
	104	for retries in itertools.count():
	105	try:
	106	rsp = self._downloader.urlopen(sanitized_Request(url))
	107	return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
	108	except network_exceptions as e:
	109	if isinstance(e, compat_HTTPError) and e.code == 404:
	110	return []
	111	if retries < max_retries:
	112	self.report_warning(f'{e}. Retrying...')
	113	if sleep_interval > 0:
	114	self.to_screen(f'Sleeping {sleep_interval} seconds ...')
	115	time.sleep(sleep_interval)
	116	continue
	117	raise PostProcessingError(f'Unable to communicate with SponsorBlock API: {e}')