# jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/rtlnl.py
# [extractor] Support multiple archive ids for one video (#4307)
# [yt-dlp.git] / yt_dlp / extractor / rtlnl.py
1 from .common import InfoExtractor
2 from ..utils import (
3 int_or_none,
4 parse_duration,
5 )
6
7
class RtlNlIE(InfoExtractor):
    """Extractor for videos identified by a UUID on rtl.nl and rtlxl.nl.

    The UUID is taken from the URL and used to query a JSON metadata
    endpoint on www.rtl.nl; formats come from the HLS manifest that the
    metadata points at.
    """

    IE_NAME = 'rtl.nl'
    IE_DESC = 'rtl.nl and rtlxl.nl'
    _VALID_URL = r'''(?x)
        https?://(?:(?:www|static)\.)?
        (?:
            rtlxl\.nl/(?:[^\#]*\#!|programma)/[^/]+/|
            rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/)|
            embed\.rtl\.nl/\#uuid=
        )
        (?P<id>[0-9a-f-]+)'''

    _TESTS = [{
        # new URL schema
        'url': 'https://www.rtlxl.nl/programma/rtl-nieuws/0bd1384d-d970-3086-98bb-5c104e10c26f',
        'md5': '490428f1187b60d714f34e1f2e3af0b6',
        'info_dict': {
            'id': '0bd1384d-d970-3086-98bb-5c104e10c26f',
            'ext': 'mp4',
            'title': 'RTL Nieuws',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'timestamp': 1593293400,
            'upload_date': '20200627',
            'duration': 661.08,
        },
    }, {
        # old URL schema
        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
        'md5': '473d1946c1fdd050b2c0161a4b13c373',
        'info_dict': {
            'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
            'ext': 'mp4',
            'title': 'RTL Nieuws',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'timestamp': 1461951000,
            'upload_date': '20160429',
            'duration': 1167.96,
        },
        'skip': '404',
    }, {
        # best format available a3t
        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
        'md5': 'dea7474214af1271d91ef332fb8be7ea',
        'info_dict': {
            'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed',
            'ext': 'mp4',
            'timestamp': 1424039400,
            'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
            'upload_date': '20150215',
            'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
        }
    }, {
        # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275)
        # best format available nettv
        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
        'info_dict': {
            'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
            'ext': 'mp4',
            'title': 'RTL Nieuws - Meer beelden van overval juwelier',
            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
            'timestamp': 1437233400,
            'upload_date': '20150718',
            'duration': 30.474,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # encrypted m3u8 streams, georestricted
        'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
        'only_matching': True,
    }, {
        'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
        'only_matching': True,
    }, {
        'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f',
        'only_matching': True,
    }, {
        'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/',
        'only_matching': True,
    }, {
        'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl',
        'only_matching': True,
    }, {
        # new embed URL schema
        'url': 'https://embed.rtl.nl/#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video whose UUID is embedded in *url*.

        Raises KeyError/IndexError if the metadata lacks the mandatory
        ``material``, ``abstracts`` or ``videopath`` entries; every other
        field is read tolerantly and may end up as ``None``/empty.
        """
        uuid = self._match_id(url)
        info = self._download_json(
            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid,
            uuid)

        material = info['material'][0]

        # The abstract name is the base title; the material title, when
        # present, is appended as a subtitle.
        title = info['abstracts'][0]['name']
        subtitle = material.get('title')
        if subtitle:
            title += ' - %s' % subtitle
        description = material.get('synopsis')

        meta = info.get('meta', {})

        # Manifest URL = host from metadata (or the fallback host) + path.
        videopath = material['videopath']
        m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath

        formats = self._extract_m3u8_formats(
            m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)
        self._sort_formats(formats)

        thumbnails = []
        # Each base URL is completed by appending the video UUID.
        # NOTE: the second key used to be written as '"thumb_base_url"'
        # (with literal double quotes), which could only match a key that
        # itself contains quote characters.
        for base_key in ('poster_base_url', 'thumb_base_url'):
            base_url = meta.get(base_key)
            if not base_url:
                continue

            thumbnails.append({
                'url': self._proto_relative_url(base_url + uuid),
                # Dimensions are parsed from a ``/sz=<width>x<height>``
                # component of the base URL when it has one.
                'width': int_or_none(self._search_regex(
                    r'/sz=([0-9]+)', base_url, 'thumbnail width', fatal=False)),
                'height': int_or_none(self._search_regex(
                    r'/sz=[0-9]+x([0-9]+)',
                    base_url, 'thumbnail height', fatal=False)),
            })

        return {
            'id': uuid,
            'title': title,
            'formats': formats,
            # Optional field: a missing 'original_date' must not abort
            # the extraction.
            'timestamp': int_or_none(material.get('original_date')),
            'description': description,
            'duration': parse_duration(material.get('duration')),
            'thumbnails': thumbnails,
        }
144
145
class RTLLuBaseIE(InfoExtractor):
    """Shared extraction logic for the rtl.lu extractors.

    Subclasses only provide ``_VALID_URL`` (with an ``id`` group) and
    tests; media URLs are scraped from ``<rtl-player>`` and
    ``<rtl-audioplayer>`` tags in the downloaded page.
    """

    # Regexes capturing the relevant attribute value for each media type.
    _MEDIA_REGEX = {
        'video': r'<rtl-player\s[^>]*\bhls\s*=\s*"([^"]+)',
        'audio': r'<rtl-audioplayer\s[^>]*\bsrc\s*=\s*"([^"]+)',
        'thumbnail': r'<rtl-player\s[^>]*\bposter\s*=\s*"([^"]+)',
    }

    def get_media_url(self, webpage, video_id, media_type):
        """Return the URL for *media_type* found in *webpage*, or None.

        *media_type* must be a key of ``_MEDIA_REGEX``. *video_id* is
        accepted for signature compatibility but is not used here.
        """
        return self._search_regex(
            self._MEDIA_REGEX[media_type], webpage, f'{media_type} url', default=None)

    def get_formats_and_subtitles(self, webpage, video_id):
        """Return ``(formats, subtitles)`` collected from *webpage*.

        HLS formats and subtitles are extracted when a video URL is
        found; an audio URL, when found, is appended as an audio-only
        mp3 format.
        """
        video_url = self.get_media_url(webpage, video_id, 'video')
        audio_url = self.get_media_url(webpage, video_id, 'audio')

        formats, subtitles = [], {}
        if video_url is not None:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id)
        if audio_url is not None:
            formats.append({'url': audio_url, 'ext': 'mp3', 'vcodec': 'none'})

        return formats, subtitles

    def _real_extract(self, url):
        """Download the page behind *url* and build its info dict."""
        video_id = self._match_id(url)
        # Live pages are the only ones with a non-numeric id. Cover every
        # id that RTLLuLiveIE accepts ('live', 'live-<n>', 'lauschteren')
        # rather than a fixed list, so e.g. 'live-3' is flagged as live too.
        is_live = video_id == 'lauschteren' or video_id.startswith('live')

        # TODO: extract comment from https://www.rtl.lu/comments?status=1&order=desc&context=news|article|<video_id>
        # we can get the context from <rtl-comments context=<context> in webpage
        webpage = self._download_webpage(url, video_id)

        formats, subtitles = self.get_formats_and_subtitles(webpage, video_id)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage, default=None),
            'formats': formats,
            'subtitles': subtitles,
            # Prefer the player's poster; fall back to the og thumbnail.
            'thumbnail': self.get_media_url(webpage, video_id, 'thumbnail') or self._og_search_thumbnail(webpage, default=None),
            'is_live': is_live,
        }
187
188
class RTLLuTeleVODIE(RTLLuBaseIE):
    """Matches rtl.lu ``/tele/<slug>/v/<id>`` and ``/video/<id>`` pages."""

    IE_NAME = 'rtl.lu:tele-vod'
    _VALID_URL = r'https?://(?:www\.)?rtl\.lu/(tele/(?P<slug>[\w-]+)/v/|video/)(?P<id>\d+)(\.html)?'
    _TESTS = [
        {
            # /tele/<slug>/v/<id>.html form
            'url': 'https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html',
            'info_dict': {
                'id': '3266757',
                'title': 'Informatiounsversammlung Héichwaasser',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2021/11/16/d3647fc4-470d-11ec-adc2-3a00abd6e90f_00008.jpg',
                'description': 'md5:b1db974408cc858c9fd241812e4a2a14',
            },
        },
        {
            # /video/<id> form
            'url': 'https://www.rtl.lu/video/3295215',
            'info_dict': {
                'id': '3295215',
                'title': 'Kulturassisen iwwer d\'Bestandsopnam vum Lëtzebuerger Konscht',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/06/28/0000_3295215_0000.jpg',
                'description': 'md5:85bcd4e0490aa6ec969d9bf16927437b',
            },
        },
    ]
211
212
class RTLLuArticleIE(RTLLuBaseIE):
    """Matches ``/<section>/<subsection>/a/<id>.html`` article pages on the
    www, 5minutes and today subdomains of rtl.lu."""

    IE_NAME = 'rtl.lu:article'
    _VALID_URL = r'https?://(?:(www|5minutes|today)\.)rtl\.lu/(?:[\w-]+)/(?:[\w-]+)/a/(?P<id>\d+)\.html'
    _TESTS = [
        {
            # Audio-only
            'url': 'https://www.rtl.lu/sport/news/a/1934360.html',
            'info_dict': {
                'id': '1934360',
                'ext': 'mp3',
                'thumbnail': 'https://static.rtl.lu/rtl2008.lu/nt/p/2022/06/28/19/e4b37d66ddf00bab4c45617b91a5bb9b.jpeg',
                'description': 'md5:5eab4a2a911c1fff7efc1682a38f9ef7',
                'title': 'md5:40aa85f135578fbd549d3c9370321f99',
            },
        },
        {
            # 5minutes
            'url': 'https://5minutes.rtl.lu/espace-frontaliers/frontaliers-en-questions/a/1853173.html',
            'info_dict': {
                'id': '1853173',
                'ext': 'mp4',
                'description': 'md5:ac031da0740e997a5cf4633173634fee',
                'title': 'md5:87e17722ed21af0f24be3243f4ec0c46',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/01/26/screenshot_20220126104933_3274749_12b249833469b0d6e4440a1dec83cdfa.jpg',
            },
        },
        {
            # today.lu
            'url': 'https://today.rtl.lu/entertainment/news/a/1936203.html',
            'info_dict': {
                'id': '1936203',
                'ext': 'mp4',
                'title': 'Once Upon A Time...zu Lëtzebuerg: The Three Witches\' Tower',
                'description': 'The witchy theme continues in the latest episode of Once Upon A Time...',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/07/02/screenshot_20220702122859_3290019_412dc5185951b7f6545a4039c8be9235.jpg',
            },
        },
    ]
247
248
class RTLLuLiveIE(RTLLuBaseIE):
    """Matches the www.rtl.lu live pages under ``/tele/`` and ``/radio/``.

    The captured id is the page slug itself (``live``, ``live-<n>`` or
    ``lauschteren``) rather than a numeric id.
    """

    _VALID_URL = r'https?://www\.rtl\.lu/(?:tele|radio)/(?P<id>live(?:-\d+)?|lauschteren)'
    _TESTS = [
        {
            # Tele:live
            'url': 'https://www.rtl.lu/tele/live',
            'info_dict': {
                'id': 'live',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel1.jpg',
            },
        },
        {
            # Tele:live-2
            'url': 'https://www.rtl.lu/tele/live-2',
            'info_dict': {
                'id': 'live-2',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel2.jpg',
            },
        },
        {
            # Radio:lauschteren
            'url': 'https://www.rtl.lu/radio/lauschteren',
            'info_dict': {
                'id': 'lauschteren',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Radio LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/rtlradiowebtv.jpg',
            },
        },
    ]
282
283
class RTLLuRadioIE(RTLLuBaseIE):
    """Matches www.rtl.lu ``/radio/<slug>/s/<id>`` pages."""

    _VALID_URL = r'https?://www\.rtl\.lu/radio/(?:[\w-]+)/s/(?P<id>\d+)(\.html)?'
    _TESTS = [
        {
            'url': 'https://www.rtl.lu/radio/5-vir-12/s/4033058.html',
            'info_dict': {
                'id': '4033058',
                'ext': 'mp3',
                'description': 'md5:f855a4f3e3235393ae47ed1db5d934b9',
                'title': '5 vir 12 - Stau um Stau',
                'thumbnail': 'https://static.rtl.lu/rtlg//2022/06/24/c9c19e5694a14be46a3647a3760e1f62.jpg',
            },
        },
    ]