[yt-dlp.git] / yt_dlp / extractor / sproutvideo.py

import base64
import urllib.parse

from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
    ExtractorError,
    int_or_none,
    qualities,
    remove_start,
    smuggle_url,
    unsmuggle_url,
    update_url_query,
    url_or_none,
    urlencode_postdata,
)
from ..utils.traversal import traverse_obj


class SproutVideoIE(InfoExtractor):
    # Extractor for SproutVideo embed pages (videos.sproutvideo.com/embed/<id>/<hash>).
    # _NO_SCHEME_RE is shared by _VALID_URL and _EMBED_REGEX so that
    # protocol-relative iframe sources ("//videos.sproutvideo.com/...") are also detected.
    _NO_SCHEME_RE = r'//videos\.sproutvideo\.com/embed/(?P<id>[\da-f]+)/[\da-f]+'
    _VALID_URL = rf'https?:{_NO_SCHEME_RE}'
    _EMBED_REGEX = [rf'<iframe [^>]*\bsrc=["\'](?P<url>(?:https?:)?{_NO_SCHEME_RE}[^"\']*)["\']']
    _TESTS = [{
        'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
        'md5': '1343ce1a6cb39d67889bfa07c7b02b0e',
        'info_dict': {
            'id': '4c9dddb01910e3c9c4',
            'ext': 'mp4',
            'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
            'duration': 576,
            'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
        },
    }, {
        'url': 'https://videos.sproutvideo.com/embed/a79fdcb21f1be2c62e/93bf31e41e39ca27',
        'md5': 'cebae5cf558cca83271917cf4ec03f26',
        'info_dict': {
            'id': 'a79fdcb21f1be2c62e',
            'ext': 'mp4',
            'title': 'HS_01_Live Stream 2023-01-14 10:00',
            'duration': 703,
            'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
        },
    }, {
        # http formats 'sd' and 'hd' are available
        'url': 'https://videos.sproutvideo.com/embed/119cd6bc1a18e6cd98/30751a1761ae5b90',
        'md5': 'f368c78df07e78a749508b221528672c',
        'info_dict': {
            'id': '119cd6bc1a18e6cd98',
            'ext': 'mp4',
            'title': '3. Updating your Partner details',
            'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
            'duration': 60,
        },
        'params': {'format': 'hd'},
    }, {
        # subtitles
        'url': 'https://videos.sproutvideo.com/embed/119dd8ba121ee0cc98/4ee50c88a343215d?type=hd',
        'md5': '7f6798f037d7a3e3e07e67959de68fc6',
        'info_dict': {
            'id': '119dd8ba121ee0cc98',
            'ext': 'mp4',
            'title': 'Recipients Setup - Domestic Wire Only',
            'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
            'duration': 77,
            'subtitles': {'en': 'count:1'},
        },
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
        'info_dict': {
            'id': '4c9dddb01910e3c9c4',
            'ext': 'mp4',
            'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
            'duration': 576,
            'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
        },
    }]
    # Filled from the decoded page data via str.format(**data); the placeholders
    # ('base', 's3_user_hash', 's3_video_hash') must therefore be keys of that data
    _M3U8_URL_TMPL = 'https://{base}.videos.sproutvideo.com/{s3_user_hash}/{s3_video_hash}/video/index.m3u8'
    _QUALITIES = ('hd', 'uhd', 'source')  # Exclude 'sd' to prioritize hls formats above it

    @staticmethod
    def _policy_to_qs(policy, signature_key, as_string=False):
        """Build the query parameters for one signature set of `policy`.

        Every key/value pair of policy['signatures'][signature_key] is copied,
        with a leading 'CloudFront-' stripped from the key name, and
        policy['sessionID'] is added as 'sessionID'.

        Returns the parameters as a dict, or as a URL-encoded string when
        `as_string` is true. Raises KeyError if 'signatures', `signature_key`
        or 'sessionID' is missing from `policy`.
        """
        query = {}
        for key, value in policy['signatures'][signature_key].items():
            query[remove_start(key, 'CloudFront-')] = value
        query['sessionID'] = policy['sessionID']
        return urllib.parse.urlencode(query, doseq=True) if as_string else query

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield the embed URLs found in `webpage` by the base implementation.

        Protocol-relative matches are made absolute with 'https:', and the
        embedding page `url` is smuggled along as 'referer' so that
        _real_extract can send it as the Referer header.
        """
        for embed_url in super()._extract_embed_urls(url, webpage):
            if embed_url.startswith('//'):
                embed_url = f'https:{embed_url}'
            yield smuggle_url(embed_url, {'referer': url})

    def _real_extract(self, url):
        """Download the embed page, decode its `dat` payload and build the info dict."""
        url, smuggled_data = unsmuggle_url(url, {})
        video_id = self._match_id(url)
        # The Referer header is only sent if a referer was smuggled into the URL
        webpage = self._download_webpage(
            url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}))
        # The page assigns a base64 string to `var dat`; it is decoded before being parsed as JSON
        data = self._search_json(
            r'var\s+dat\s*=\s*["\']', webpage, 'data', video_id, contains_pattern=r'[A-Za-z0-9+/=]+',
            end_pattern=r'["\'];', transform_source=lambda x: base64.b64decode(x).decode())

        formats, subtitles = [], {}
        # Used for the manifest request and also returned as 'http_headers'
        headers = {
            'Accept': '*/*',
            'Origin': 'https://videos.sproutvideo.com',
            'Referer': url,
        }

        # HLS extraction is fatal; only attempt it if the JSON data says it's available
        if traverse_obj(data, 'hls'):
            # Separate signature sets are used for the manifest ('m'),
            # the media fragments ('t') and the decryption keys ('k')
            manifest_query = self._policy_to_qs(data, 'm')
            fragment_query = self._policy_to_qs(data, 't', as_string=True)
            key_query = self._policy_to_qs(data, 'k', as_string=True)

            hls_formats = self._extract_m3u8_formats(
                self._M3U8_URL_TMPL.format(**data), video_id, 'mp4',
                m3u8_id='hls', headers=headers, query=manifest_query)
            # Only the HLS formats get the signed query parameters; operate on them
            # explicitly so this does not depend on `formats` still being empty here
            for fmt in hls_formats:
                fmt.update({
                    'url': update_url_query(fmt['url'], manifest_query),
                    'extra_param_to_segment_url': fragment_query,
                    'extra_param_to_key_url': key_query,
                })
            formats.extend(hls_formats)

        # 'downloads' maps a format id to a direct URL; entries without a valid URL are dropped
        if downloads := traverse_obj(data, ('downloads', {dict.items}, lambda _, v: url_or_none(v[1]))):
            quality = qualities(self._QUALITIES)
            # Only an explicit `false` marks the video as having no audio track
            acodec = 'none' if data.get('has_audio') is False else None
            formats.extend([{
                'format_id': str(format_id),
                'url': format_url,
                'ext': 'mp4',
                'quality': quality(format_id),
                'acodec': acodec,
            } for format_id, format_url in downloads])

        for sub_data in traverse_obj(data, ('subtitleData', lambda _, v: url_or_none(v['src']))):
            # Fall back to 'en' when 'srclang' is missing, null or empty, so that
            # the subtitle language key is never None or ''
            subtitles.setdefault(sub_data.get('srclang') or 'en', []).append({
                'url': sub_data['src'],
            })

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'http_headers': headers,
            **traverse_obj(data, {
                'title': ('title', {str}),
                'duration': ('duration', {int_or_none}),
                'thumbnail': ('posterframe_url', {url_or_none}),
            }),
        }


class VidsIoIE(InfoExtractor):
    # Extractor for <subdomain>.vids.io video pages; the actual extraction is
    # delegated to SproutVideoIE through the embed URL found in the page.
    IE_NAME = 'vids.io'
    _VALID_URL = r'https?://[\w-]+\.vids\.io/videos/(?P<id>[\da-f]+)/(?P<display_id>[\w-]+)'
    _TESTS = [{
        'url': 'https://how-to-video.vids.io/videos/799cd8b11c10efc1f0/how-to-video-live-streaming',
        'md5': '9bbbb2c0c0739eb163b80f87b8d77c9e',
        'info_dict': {
            'id': '799cd8b11c10efc1f0',
            'ext': 'mp4',
            'title': 'How to Video: Live Streaming',
            'duration': 2787,
            'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
        },
    }]

    def _real_extract(self, url):
        """Resolve a vids.io page to its SproutVideo embed URL.

        A 403 response to the initial request is handled as a password prompt:
        the password given with --video-password is submitted together with
        the page's hidden form inputs. Raises an expected ExtractorError when
        no password was supplied or when the submission is answered with 403
        again, and an ExtractorError when no embed URL is found in the page.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id')
        # 403 is accepted here so that the password form can be read from the response
        webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=403)

        if urlh.status == 403:
            password = self.get_param('videopassword')
            if not password:
                raise ExtractorError(
                    'This video is password-protected; use the --video-password option', expected=True)
            try:
                # Hidden inputs are merged after the password, as in a dict literal
                form = {'password': password}
                form.update(self._hidden_inputs(webpage))
                webpage = self._download_webpage(
                    url, display_id, 'Submitting video password', data=urlencode_postdata(form))
                # Requests with user's session cookie `_sproutvideo_session` are now authorized
            except ExtractorError as err:
                cause = err.cause
                rejected = isinstance(cause, HTTPError) and cause.status == 403
                if not rejected:
                    raise
                raise ExtractorError('Incorrect password', expected=True)

        # Only the first embed found in the page is used
        embed_url = next(SproutVideoIE._extract_embed_urls(url, webpage), None)
        if not embed_url:
            raise ExtractorError('Unable to extract any SproutVideo embed url')

        return self.url_result(embed_url, SproutVideoIE, video_id)
Commit	Line	Data
d6c2c2bc	1	import base64
	2	import urllib.parse
	3
	4	from .common import InfoExtractor
	5	from ..networking.exceptions import HTTPError
	6	from ..utils import (
	7	ExtractorError,
	8	int_or_none,
	9	qualities,
	10	remove_start,
	11	smuggle_url,
	12	unsmuggle_url,
	13	update_url_query,
	14	url_or_none,
	15	urlencode_postdata,
	16	)
	17	from ..utils.traversal import traverse_obj
	18
	19
	20	class SproutVideoIE(InfoExtractor):
	21	_NO_SCHEME_RE = r'//videos\.sproutvideo\.com/embed/(?P<id>[\da-f]+)/[\da-f]+'
	22	_VALID_URL = rf'https?:{_NO_SCHEME_RE}'
	23	_EMBED_REGEX = [rf'<iframe [^>]*\bsrc=["\'](?P<url>(?:https?:)?{_NO_SCHEME_RE}[^"\']*)["\']']
	24	_TESTS = [{
	25	'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
	26	'md5': '1343ce1a6cb39d67889bfa07c7b02b0e',
	27	'info_dict': {
	28	'id': '4c9dddb01910e3c9c4',
	29	'ext': 'mp4',
	30	'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
	31	'duration': 576,
	32	'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
	33	},
	34	}, {
	35	'url': 'https://videos.sproutvideo.com/embed/a79fdcb21f1be2c62e/93bf31e41e39ca27',
	36	'md5': 'cebae5cf558cca83271917cf4ec03f26',
	37	'info_dict': {
	38	'id': 'a79fdcb21f1be2c62e',
	39	'ext': 'mp4',
	40	'title': 'HS_01_Live Stream 2023-01-14 10:00',
	41	'duration': 703,
	42	'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
	43	},
	44	}, {
	45	# http formats 'sd' and 'hd' are available
	46	'url': 'https://videos.sproutvideo.com/embed/119cd6bc1a18e6cd98/30751a1761ae5b90',
	47	'md5': 'f368c78df07e78a749508b221528672c',
	48	'info_dict': {
	49	'id': '119cd6bc1a18e6cd98',
	50	'ext': 'mp4',
	51	'title': '3. Updating your Partner details',
	52	'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
	53	'duration': 60,
	54	},
	55	'params': {'format': 'hd'},
	56	}, {
	57	# subtitles
	58	'url': 'https://videos.sproutvideo.com/embed/119dd8ba121ee0cc98/4ee50c88a343215d?type=hd',
	59	'md5': '7f6798f037d7a3e3e07e67959de68fc6',
	60	'info_dict': {
	61	'id': '119dd8ba121ee0cc98',
	62	'ext': 'mp4',
	63	'title': 'Recipients Setup - Domestic Wire Only',
	64	'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
65	'duration': 77,
66	'subtitles': {'en': 'count:1'},
67	},
68	}]
69	_WEBPAGE_TESTS = [{
70	'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
71	'info_dict': {
72	'id': '4c9dddb01910e3c9c4',
73	'ext': 'mp4',
74	'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
75	'duration': 576,
76	'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
77	},
78	}]
79	_M3U8_URL_TMPL = 'https://{base}.videos.sproutvideo.com/{s3_user_hash}/{s3_video_hash}/video/index.m3u8'
80	_QUALITIES = ('hd', 'uhd', 'source') # Exclude 'sd' to prioritize hls formats above it
81
82	@staticmethod
83	def _policy_to_qs(policy, signature_key, as_string=False):
84	query = {}
85	for key, value in policy['signatures'][signature_key].items():
86	query[remove_start(key, 'CloudFront-')] = value
87	query['sessionID'] = policy['sessionID']
88	return urllib.parse.urlencode(query, doseq=True) if as_string else query
89
90	@classmethod
91	def _extract_embed_urls(cls, url, webpage):
92	for embed_url in super()._extract_embed_urls(url, webpage):
93	if embed_url.startswith('//'):
94	embed_url = f'https:{embed_url}'
95	yield smuggle_url(embed_url, {'referer': url})
96
97	def _real_extract(self, url):
98	url, smuggled_data = unsmuggle_url(url, {})
99	video_id = self._match_id(url)
100	webpage = self._download_webpage(
101	url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}))
102	data = self._search_json(
	103	r'var\s+dat\s*=\s*["\']', webpage, 'data', video_id, contains_pattern=r'[A-Za-z0-9+/=]+',
104	end_pattern=r'["\'];', transform_source=lambda x: base64.b64decode(x).decode())
105
106	formats, subtitles = [], {}
107	headers = {
	108	'Accept': '*/*',
109	'Origin': 'https://videos.sproutvideo.com',
110	'Referer': url,
111	}
112
113	# HLS extraction is fatal; only attempt it if the JSON data says it's available
114	if traverse_obj(data, 'hls'):
115	manifest_query = self._policy_to_qs(data, 'm')
116	fragment_query = self._policy_to_qs(data, 't', as_string=True)
117	key_query = self._policy_to_qs(data, 'k', as_string=True)
118
119	formats.extend(self._extract_m3u8_formats(
120	self._M3U8_URL_TMPL.format(**data), video_id, 'mp4',
121	m3u8_id='hls', headers=headers, query=manifest_query))
122	for fmt in formats:
123	fmt.update({
124	'url': update_url_query(fmt['url'], manifest_query),
125	'extra_param_to_segment_url': fragment_query,
126	'extra_param_to_key_url': key_query,
127	})
128
129	if downloads := traverse_obj(data, ('downloads', {dict.items}, lambda _, v: url_or_none(v[1]))):
130	quality = qualities(self._QUALITIES)
131	acodec = 'none' if data.get('has_audio') is False else None
132	formats.extend([{
133	'format_id': str(format_id),
134	'url': format_url,
135	'ext': 'mp4',
136	'quality': quality(format_id),
137	'acodec': acodec,
138	} for format_id, format_url in downloads])
139
140	for sub_data in traverse_obj(data, ('subtitleData', lambda _, v: url_or_none(v['src']))):
141	subtitles.setdefault(sub_data.get('srclang', 'en'), []).append({
142	'url': sub_data['src'],
143	})
144
145	return {
146	'id': video_id,
147	'formats': formats,
148	'subtitles': subtitles,
149	'http_headers': headers,
150	**traverse_obj(data, {
151	'title': ('title', {str}),
152	'duration': ('duration', {int_or_none}),
153	'thumbnail': ('posterframe_url', {url_or_none}),
154	}),
155	}
156
157
158	class VidsIoIE(InfoExtractor):
159	IE_NAME = 'vids.io'
160	_VALID_URL = r'https?://[\w-]+\.vids\.io/videos/(?P<id>[\da-f]+)/(?P<display_id>[\w-]+)'
161	_TESTS = [{
162	'url': 'https://how-to-video.vids.io/videos/799cd8b11c10efc1f0/how-to-video-live-streaming',
163	'md5': '9bbbb2c0c0739eb163b80f87b8d77c9e',
164	'info_dict': {
165	'id': '799cd8b11c10efc1f0',
166	'ext': 'mp4',
167	'title': 'How to Video: Live Streaming',
168	'duration': 2787,
169	'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
170	},
171	}]
172
173	def _real_extract(self, url):
174	video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
175	webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=403)
176
177	if urlh.status == 403:
178	password = self.get_param('videopassword')
179	if not password:
180	raise ExtractorError(
181	'This video is password-protected; use the --video-password option', expected=True)
182	try:
183	webpage = self._download_webpage(
184	url, display_id, 'Submitting video password',
185	data=urlencode_postdata({
186	'password': password,
187	**self._hidden_inputs(webpage),
188	}))
189	# Requests with user's session cookie `_sproutvideo_session` are now authorized
190	except ExtractorError as e:
191	if isinstance(e.cause, HTTPError) and e.cause.status == 403:
192	raise ExtractorError('Incorrect password', expected=True)
193	raise
194
195	if embed_url := next(SproutVideoIE._extract_embed_urls(url, webpage), None):
196	return self.url_result(embed_url, SproutVideoIE, video_id)
197
198	raise ExtractorError('Unable to extract any SproutVideo embed url')