]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/rtvcplay.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / rtvcplay.py
CommitLineData
9b30cd3d
E
1import re
2
3from .common import InfoExtractor, ExtractorError
4from ..utils import (
5 clean_html,
6 determine_ext,
7 int_or_none,
8 float_or_none,
9 js_to_json,
10 mimetype2ext,
11 traverse_obj,
12 urljoin,
13 url_or_none,
14)
15
16
class RTVCPlayBaseIE(InfoExtractor):
    """Helpers shared by the RTVCPlay extractors (player config parsing and format collection)."""

    _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co'

    def _extract_player_config(self, webpage, video_id):
        """Return the ``config`` object declared in an inline ``<script>`` of ``webpage``.

        The object is located after a ``var``/``let``/``const`` declaration named
        ``config`` and is converted with ``js_to_json`` before being parsed.
        """
        # Drop every `" + "` sequence so that string literals written as a
        # concatenation become a single literal before the JSON search runs
        joined_webpage = re.sub(r'"\s*\+\s*"', '', webpage)
        config_start = r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*='
        return self._search_json(
            config_start, joined_webpage, 'player_config', video_id,
            transform_source=js_to_json)

    def _extract_formats_and_subtitles_player_config(self, player_config, video_id):
        """Build ``(formats, subtitles)`` from the ``sources`` of a player config.

        Only sources whose ``url`` passes ``url_or_none`` are considered.
        Sources resolving to the ``m3u8`` extension are expanded through the
        (non-fatal) HLS manifest extraction; any other source becomes a single
        format entry carrying its URL and extension.
        """
        formats, subtitles = [], {}
        valid_sources = traverse_obj(
            player_config, ('sources', ..., lambda _, v: url_or_none(v['url'])))

        for source in valid_sources:
            source_url = source['url']
            # Prefer the declared mimetype; fall back to the URL's extension
            ext = mimetype2ext(source.get('mimetype'), default=determine_ext(source_url))
            if ext != 'm3u8':
                formats.append({
                    'url': source_url,
                    'ext': ext,
                })
                continue

            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                source_url, video_id, 'mp4', fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subtitles, target=subtitles)

        return formats, subtitles
41
42
class RTVCPlayIE(RTVCPlayBaseIE):
    """Extractor for rtvcplay.co content pages (anything outside ``/embed``).

    A page yields either a single HLS stream, a playlist of program episodes
    or a playlist of podcast episodes, depending on what its state contains.
    """

    _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional',
        'info_dict': {
            'id': 'canal-institucional',
            'title': r're:^Canal Institucional',
            'description': 'md5:eff9e548394175928059320c006031ea',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia',
        'info_dict': {
            'id': 'senal-colombia',
            'title': r're:^Señal Colombia',
            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional',
        'info_dict': {
            'id': 'radio-nacional',
            'title': r're:^Radio Nacional',
            'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas',
        'md5': '1288ee6f6d1330d880f98bff2ed710a3',
        'info_dict': {
            'id': 'senoritas',
            'title': 'Señoritas',
            'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022',
        'md5': 'f040a7380a269ad633cf837384d5e9fc',
        'info_dict': {
            'id': 'james-regresa-clases-28022022',
            'title': 'James regresa a clases - 28/02/2022',
            'description': 'md5:c5dcdf757c7ab29305e8763c6007e675',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo',
        'info_dict': {
            'id': 'llinas-el-cerebro-y-el-universo',
            'title': 'Llinás, el cerebro y el universo',
            'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa',
        'info_dict': {
            'id': 'profe-en-tu-casa',
            'title': 'Profe en tu casa',
            'description': 'md5:47dbe20e263194413b1db2a2805a4f2e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 537,
    }, {
        'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
        'info_dict': {
            'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
            'title': 'Relato de un náufrago: una travesía del periodismo a la literatura',
            'description': 'md5:6da28fdca4a5a568ea47ef65ef775603',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones',
        'info_dict': {
            'id': 'diez-versiones',
            'title': 'Diez versiones',
            'description': 'md5:997471ed971cb3fd8e41969457675306',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        """Extract a stream or a playlist from the page's ``__RTVCPLAY_STATE__`` data.

        Returns an info dict with HLS formats when a stream URL can be derived
        (from the video's asset id or from the channel entry). Otherwise returns
        a playlist built from the ``seasonList`` widget, or failing that from
        the ``audios`` list.

        Raises ExtractorError when none of the three sources is present.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        category = mobj.group('category')
        webpage = self._download_webpage(url, video_id)

        state = self._search_json(
            r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration',
            video_id, transform_source=js_to_json)
        hydration = state['content']['currentContent']

        # With an asset id the HLS URL is built from a template;
        # without one, fall back to the channel's own HLS URL
        asset_id = traverse_obj(hydration, ('video', 'assetid'))
        hls_url = (
            hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id)
            if asset_id else traverse_obj(hydration, ('channel', 'hls')))

        metadata = traverse_obj(hydration, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'),
        }, get_all=False)

        if hls_url:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4')
            return {
                'id': video_id,
                'formats': formats,
                'subtitles': subtitles,
                'is_live': category == 'en-vivo',
                **metadata,
            }

        # No stream URL: probably it's a program's page
        seasons = traverse_obj(
            hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'),
            get_all=False)

        if seasons:
            entries = []
            for season in seasons:
                season_info = traverse_obj(season, {
                    'season': 'title',
                    'season_number': ('season', {int_or_none}),
                })
                for episode in traverse_obj(season, ('contents', ...)):
                    episode_info = traverse_obj(episode, {
                        'title': 'title',
                        'thumbnail': ('image', 'cover', 'path'),
                        'episode_number': ('chapter_number', {int_or_none}),
                    })
                    entries.append(self.url_result(
                        urljoin(url, episode['slug']), url_transparent=True,
                        **season_info, **episode_info))

            return self.playlist_result(entries, video_id, **metadata)

        # No seasons either: last resort is a list of podcast episodes
        podcast_episodes = hydration.get('audios')
        if not podcast_episodes:
            raise ExtractorError('Could not find asset_id nor program playlist nor podcast episodes')

        podcast_entries = []
        for episode in podcast_episodes:
            episode_info = traverse_obj(episode, {
                'title': 'title',
                'description': ('description', {clean_html}),
                'episode_number': ('chapter_number', {float_or_none}, {int_or_none}),
                'season_number': ('season', {int_or_none}),
            })
            podcast_entries.append(
                self.url_result(episode['file'], url_transparent=True, **episode_info))

        return self.playlist_result(podcast_entries, video_id, **metadata)
202
203
class RTVCPlayEmbedIE(RTVCPlayBaseIE):
    """Extractor for rtvcplay.co ``/embed/<id>`` player pages."""

    _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9',
        'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8',
        'info_dict': {
            'id': '72b0e699-248b-4929-a4a8-3782702fa7f9',
            'title': 'Tráiler: Señoritas',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'ext': 'mp4',
        },
    }]

    def _real_extract(self, url):
        """Extract formats from the embedded player config and metadata from the CMS API.

        Metadata is requested only when the player config carries an asset id,
        and that request is non-fatal.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_config = self._extract_player_config(webpage, video_id)
        formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)

        metadata = {}
        asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid'))
        if asset_id:
            metadata = self._download_json(
                f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False)

        metadata_fields = traverse_obj(metadata, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ('image', ..., 'thumbnail', 'path'),
        }, get_all=False)

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            **metadata_fields,
        }
239
240
class RTVCKalturaIE(RTVCPlayBaseIE):
    """Extractor for the live player pages under media.rtvc.gov.co/kalturartvc."""

    _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html',
        'info_dict': {
            'id': 'indexSC',
            'title': r're:^Señal Colombia',
            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }]

    def _real_extract(self, url):
        """Extract a live stream from the player config plus the channel's CMS entry.

        Formats found in the player config are combined with those of the HLS
        URL listed in the channel metadata. The metadata request (made only
        when a channel id is present) and the HLS extraction are both
        non-fatal. The result is always flagged as live.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_config = self._extract_player_config(webpage, video_id)
        formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)

        metadata = {}
        channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId'))
        if channel_id:
            metadata = self._download_json(
                f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False)

        # Add the channel-level HLS stream on top of the player config sources
        channel_hls_url = traverse_obj(metadata, ('channel', 'hls'))
        channel_formats, channel_subtitles = self._extract_m3u8_formats_and_subtitles(
            channel_hls_url, video_id, 'mp4', fatal=False)
        formats.extend(channel_formats)
        self._merge_subtitles(channel_subtitles, target=subtitles)

        metadata_fields = traverse_obj(metadata, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ('channel', 'image', 'logo', 'path'),
        })

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
            **metadata_fields,
        }