]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/rtlnl.py
[extractor/reddit] Add fallback format (#5165)
[yt-dlp.git] / yt_dlp / extractor / rtlnl.py
1 from .common import InfoExtractor
2 from ..utils import (
3 int_or_none,
4 parse_duration,
5 )
6
7
class RtlNlIE(InfoExtractor):
    """Extractor for rtl.nl / rtlxl.nl videos, which are addressed by a UUID."""

    IE_NAME = 'rtl.nl'
    IE_DESC = 'rtl.nl and rtlxl.nl'
    _EMBED_REGEX = [r'<iframe[^>]+?\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)(?P=q1)']
    _VALID_URL = r'''(?x)
        https?://(?:(?:www|static)\.)?
        (?:
            rtlxl\.nl/(?:[^\#]*\#!|programma)/[^/]+/|
            rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/)|
            embed\.rtl\.nl/\#uuid=
        )
        (?P<id>[0-9a-f-]+)'''

    _TESTS = [{
        # new URL schema
        'url': 'https://www.rtlxl.nl/programma/rtl-nieuws/0bd1384d-d970-3086-98bb-5c104e10c26f',
        'md5': '490428f1187b60d714f34e1f2e3af0b6',
        'info_dict': {
            'id': '0bd1384d-d970-3086-98bb-5c104e10c26f',
            'ext': 'mp4',
            'title': 'RTL Nieuws',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'timestamp': 1593293400,
            'upload_date': '20200627',
            'duration': 661.08,
        },
    }, {
        # old URL schema
        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
        'md5': '473d1946c1fdd050b2c0161a4b13c373',
        'info_dict': {
            'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
            'ext': 'mp4',
            'title': 'RTL Nieuws',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'timestamp': 1461951000,
            'upload_date': '20160429',
            'duration': 1167.96,
        },
        'skip': '404',
    }, {
        # best format available a3t
        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
        'md5': 'dea7474214af1271d91ef332fb8be7ea',
        'info_dict': {
            'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed',
            'ext': 'mp4',
            'timestamp': 1424039400,
            'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
            'upload_date': '20150215',
            'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
        },
    }, {
        # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275)
        # best format available nettv
        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
        'info_dict': {
            'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
            'ext': 'mp4',
            'title': 'RTL Nieuws - Meer beelden van overval juwelier',
            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
            'timestamp': 1437233400,
            'upload_date': '20150718',
            'duration': 30.474,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # encrypted m3u8 streams, georestricted
        'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
        'only_matching': True,
    }, {
        'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
        'only_matching': True,
    }, {
        'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f',
        'only_matching': True,
    }, {
        'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/',
        'only_matching': True,
    }, {
        'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl',
        'only_matching': True,
    }, {
        # new embed URL schema
        'url': 'https://embed.rtl.nl/#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video whose UUID is embedded in *url*.

        Fetches the JSON metadata for the UUID, derives the HLS manifest URL
        from ``meta.videohost`` + ``material[0].videopath`` and collects
        thumbnails from the base URLs advertised in ``meta``.

        The first ``material`` entry, its ``videopath`` and the first
        ``abstracts`` entry's ``name`` are required; a response lacking them
        raises KeyError/IndexError. All other fields are optional.
        """
        uuid = self._match_id(url)
        info = self._download_json(
            f'http://www.rtl.nl/system/s4m/vfd/version=2/uuid={uuid}/fmt=adaptive/',
            uuid)

        material = info['material'][0]
        title = info['abstracts'][0]['name']
        subtitle = material.get('title')
        if subtitle:
            title = f'{title} - {subtitle}'

        # `or {}` also covers an explicit JSON null, which .get(key, {}) would pass through
        meta = info.get('meta') or {}

        # Fall back to the default host when 'videohost' is missing *or* null
        m3u8_url = (meta.get('videohost') or 'http://manifest.us.rtl.nl') + material['videopath']

        formats = self._extract_m3u8_formats(
            m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)
        self._sort_formats(formats)

        thumbnails = []
        # NOTE(review): the second key really is spelled with embedded double
        # quotes; presumably it mirrors the key as served by the API - confirm
        # before "fixing" it.
        for key in ('poster_base_url', '"thumb_base_url"'):
            base_url = meta.get(key)
            if not base_url:
                continue

            # The dimensions are parsed from a '/sz=<width>x<height>' path segment
            thumbnails.append({
                'url': self._proto_relative_url(base_url + uuid),
                'width': int_or_none(self._search_regex(
                    r'/sz=([0-9]+)', base_url, 'thumbnail width', fatal=False)),
                'height': int_or_none(self._search_regex(
                    r'/sz=[0-9]+x([0-9]+)',
                    base_url, 'thumbnail height', fatal=False)),
            })

        return {
            'id': uuid,
            'title': title,
            'formats': formats,
            # Optional field: a missing value must not abort the whole extraction
            'timestamp': int_or_none(material.get('original_date')),
            'description': material.get('synopsis'),
            'duration': parse_duration(material.get('duration')),
            'thumbnails': thumbnails,
        }
145
146
class RTLLuBaseIE(InfoExtractor):
    """Common webpage-scraping logic shared by the rtl.lu extractors."""

    # One capture group each: the URL held in the matching player tag attribute
    _MEDIA_REGEX = {
        'video': r'<rtl-player\s[^>]*\bhls\s*=\s*"([^"]+)',
        'audio': r'<rtl-audioplayer\s[^>]*\bsrc\s*=\s*"([^"]+)',
        'thumbnail': r'<rtl-player\s[^>]*\bposter\s*=\s*"([^"]+)',
    }

    def get_media_url(self, webpage, video_id, media_type):
        """Return the URL for *media_type* found in *webpage*, or None.

        *media_type* must be a key of ``_MEDIA_REGEX``; *video_id* is accepted
        but not used by this implementation.
        """
        pattern = self._MEDIA_REGEX[media_type]
        return self._search_regex(pattern, webpage, f'{media_type} url', default=None)

    def get_formats_and_subtitles(self, webpage, video_id):
        """Return ``(formats, subtitles)`` built from the players in *webpage*.

        The HLS manifest (if any) supplies formats and subtitles; an audio
        player source (if any) is appended as an extra audio-only mp3 format.
        """
        hls_url = self.get_media_url(webpage, video_id, 'video')
        mp3_url = self.get_media_url(webpage, video_id, 'audio')

        if hls_url is None:
            formats, subtitles = [], {}
        else:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id)

        if mp3_url is not None:
            audio_format = {'url': mp3_url, 'ext': 'mp3', 'vcodec': 'none'}
            formats.append(audio_format)

        return formats, subtitles

    def _real_extract(self, url):
        """Download the page at *url* and assemble its info dict."""
        video_id = self._match_id(url)

        # TODO: extract comments from https://www.rtl.lu/comments?status=1&order=desc&context=news|article|<video_id>
        # the context can be read from <rtl-comments context=<context> in the webpage
        webpage = self._download_webpage(url, video_id)

        formats, subtitles = self.get_formats_and_subtitles(webpage, video_id)
        self._sort_formats(formats)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage, default=None)

        # Prefer the player's poster; only consult OpenGraph when it is absent
        thumbnail = self.get_media_url(webpage, video_id, 'thumbnail')
        if not thumbnail:
            thumbnail = self._og_search_thumbnail(webpage, default=None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'subtitles': subtitles,
            'thumbnail': thumbnail,
            # Only these fixed ids are treated as live streams
            'is_live': video_id in ('live', 'live-2', 'lauschteren'),
        }
188
189
class RTLLuTeleVODIE(RTLLuBaseIE):
    """rtl.lu on-demand video pages (``/tele/<slug>/v/<id>`` and ``/video/<id>``)."""

    IE_NAME = 'rtl.lu:tele-vod'
    _VALID_URL = r'https?://(?:www\.)?rtl\.lu/(tele/(?P<slug>[\w-]+)/v/|video/)(?P<id>\d+)(\.html)?'
    _TESTS = [
        {
            'url': 'https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html',
            'info_dict': {
                'id': '3266757',
                'title': 'Informatiounsversammlung Héichwaasser',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2021/11/16/d3647fc4-470d-11ec-adc2-3a00abd6e90f_00008.jpg',
                'description': 'md5:b1db974408cc858c9fd241812e4a2a14',
            },
        },
        {
            'url': 'https://www.rtl.lu/video/3295215',
            'info_dict': {
                'id': '3295215',
                'title': 'Kulturassisen iwwer d\'Bestandsopnam vum Lëtzebuerger Konscht',
                'ext': 'mp4',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/06/28/0000_3295215_0000.jpg',
                'description': 'md5:85bcd4e0490aa6ec969d9bf16927437b',
            },
        },
    ]
212
213
class RTLLuArticleIE(RTLLuBaseIE):
    """Article pages (``.../a/<id>.html``) on the www, 5minutes and today rtl.lu hosts."""

    IE_NAME = 'rtl.lu:article'
    _VALID_URL = r'https?://(?:(www|5minutes|today)\.)rtl\.lu/(?:[\w-]+)/(?:[\w-]+)/a/(?P<id>\d+)\.html'
    _TESTS = [
        {
            # Audio-only
            'url': 'https://www.rtl.lu/sport/news/a/1934360.html',
            'info_dict': {
                'id': '1934360',
                'ext': 'mp3',
                'thumbnail': 'https://static.rtl.lu/rtl2008.lu/nt/p/2022/06/28/19/e4b37d66ddf00bab4c45617b91a5bb9b.jpeg',
                'description': 'md5:5eab4a2a911c1fff7efc1682a38f9ef7',
                'title': 'md5:40aa85f135578fbd549d3c9370321f99',
            },
        },
        {
            # 5minutes
            'url': 'https://5minutes.rtl.lu/espace-frontaliers/frontaliers-en-questions/a/1853173.html',
            'info_dict': {
                'id': '1853173',
                'ext': 'mp4',
                'description': 'md5:ac031da0740e997a5cf4633173634fee',
                'title': 'md5:87e17722ed21af0f24be3243f4ec0c46',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/01/26/screenshot_20220126104933_3274749_12b249833469b0d6e4440a1dec83cdfa.jpg',
            },
        },
        {
            # today.lu
            'url': 'https://today.rtl.lu/entertainment/news/a/1936203.html',
            'info_dict': {
                'id': '1936203',
                'ext': 'mp4',
                'title': 'Once Upon A Time...zu Lëtzebuerg: The Three Witches\' Tower',
                'description': 'The witchy theme continues in the latest episode of Once Upon A Time...',
                'thumbnail': 'https://replay-assets.rtl.lu/2022/07/02/screenshot_20220702122859_3290019_412dc5185951b7f6545a4039c8be9235.jpg',
            },
        },
    ]
248
249
class RTLLuLiveIE(RTLLuBaseIE):
    """Live TV and radio pages under ``www.rtl.lu/tele`` and ``www.rtl.lu/radio``."""

    _VALID_URL = r'https?://www\.rtl\.lu/(?:tele|radio)/(?P<id>live(?:-\d+)?|lauschteren)'
    _TESTS = [
        {
            # Tele:live
            'url': 'https://www.rtl.lu/tele/live',
            'info_dict': {
                'id': 'live',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel1.jpg',
            },
        },
        {
            # Tele:live-2
            'url': 'https://www.rtl.lu/tele/live-2',
            'info_dict': {
                'id': 'live-2',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/channel2.jpg',
            },
        },
        {
            # Radio:lauschteren
            'url': 'https://www.rtl.lu/radio/lauschteren',
            'info_dict': {
                'id': 'lauschteren',
                'ext': 'mp4',
                'live_status': 'is_live',
                'title': r're:RTL - Radio LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                'thumbnail': 'https://static.rtl.lu/livestream/rtlradiowebtv.jpg',
            },
        },
    ]
283
284
class RTLLuRadioIE(RTLLuBaseIE):
    """Radio show pages (``www.rtl.lu/radio/<show>/s/<id>``)."""

    _VALID_URL = r'https?://www\.rtl\.lu/radio/(?:[\w-]+)/s/(?P<id>\d+)(\.html)?'
    _TESTS = [
        {
            'url': 'https://www.rtl.lu/radio/5-vir-12/s/4033058.html',
            'info_dict': {
                'id': '4033058',
                'ext': 'mp3',
                'description': 'md5:f855a4f3e3235393ae47ed1db5d934b9',
                'title': '5 vir 12 - Stau um Stau',
                'thumbnail': 'https://static.rtl.lu/rtlg//2022/06/24/c9c19e5694a14be46a3647a3760e1f62.jpg',
            },
        },
    ]