]>
Commit | Line | Data |
---|---|---|
1 | import functools | |
2 | import re | |
3 | ||
4 | from .common import InfoExtractor | |
5 | from ..utils import ( | |
6 | OnDemandPagedList, | |
7 | bug_reports_message, | |
8 | determine_ext, | |
9 | int_or_none, | |
10 | join_nonempty, | |
11 | jwt_decode_hs256, | |
12 | make_archive_id, | |
13 | parse_duration, | |
14 | parse_iso8601, | |
15 | remove_start, | |
16 | str_or_none, | |
17 | unified_strdate, | |
18 | update_url_query, | |
19 | url_or_none, | |
20 | xpath_text, | |
21 | ) | |
22 | from ..utils.traversal import traverse_obj | |
23 | ||
24 | ||
class ARDMediathekBaseIE(InfoExtractor):
    """Shared helpers that turn an ARD media JSON document into an info dict."""

    _GEO_COUNTRIES = ['DE']

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media JSON from media_info_url and parse it.

        The FSK flag handed to _parse_media_info is whether the literal
        text '"fsk"' occurs in webpage.
        """
        media_json = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')
        has_fsk_marker = '"fsk"' in webpage
        return self._parse_media_info(media_json, video_id, has_fsk_marker)

    def _parse_media_info(self, media_info, video_id, fsk):
        """Build the info dict (formats, subtitles, basic metadata) from media_info.

        When no formats were found, the reason is reported: the FSK notice
        if fsk is truthy, otherwise the geo restriction notice if
        media_info carries a truthy '_geoblocked' value.
        """
        found_formats = self._extract_formats(media_info, video_id)

        if not found_formats and fsk:
            self.raise_no_formats(
                'This video is only available after 20:00', expected=True)
        elif not found_formats and media_info.get('_geoblocked'):
            self.raise_geo_restricted(
                'This video is not available due to geoblocking',
                countries=self._GEO_COUNTRIES, metadata_available=True)

        subs = {}
        ttml_url = media_info.get('_subtitleUrl')
        if ttml_url:
            # The WebVTT variant URL is derived from the TTML one
            vtt_url = ttml_url.replace('/ebutt/', '/webvtt/') + '.vtt'
            subs['de'] = [
                {'ext': 'ttml', 'url': ttml_url},
                {'ext': 'vtt', 'url': vtt_url},
            ]

        return {
            'id': video_id,
            'duration': int_or_none(media_info.get('_duration')),
            'thumbnail': media_info.get('_previewImage'),
            'is_live': media_info.get('_isLive') is True,
            'formats': found_formats,
            'subtitles': subs,
        }

    def _extract_formats(self, media_info, video_id):
        """Collect format dicts from media_info['_mediaArray'].

        Manifest URLs (f4m/m3u8) are only expanded when the stream quality
        is 'auto'; every other valid URL becomes a single direct or RTMP
        format.
        """
        is_audio = media_info.get('_type') == 'audio'
        collected = []
        for index, media in enumerate(media_info.get('_mediaArray', [])):
            for stream in media.get('_mediaStreamArray', []):
                candidates = stream.get('_stream')
                if not candidates:
                    continue
                # '_stream' is either one URL or a list of URLs
                if not isinstance(candidates, list):
                    candidates = [candidates]
                quality, server = stream.get('_quality'), stream.get('_server')
                for candidate in candidates:
                    if not url_or_none(candidate):
                        continue
                    ext = determine_ext(candidate)

                    if ext in ('f4m', 'm3u8'):
                        if quality != 'auto':
                            continue
                        if ext == 'f4m':
                            manifest_url = update_url_query(candidate, {
                                'hdcore': '3.1.1',
                                'plugin': 'aasp-3.1.1.69.124'
                            })
                            collected.extend(self._extract_f4m_formats(
                                manifest_url, video_id, f4m_id='hds', fatal=False))
                        else:
                            collected.extend(self._extract_m3u8_formats(
                                candidate, video_id, 'mp4', 'm3u8_native',
                                m3u8_id='hls', fatal=False))
                        continue

                    if server and server.startswith('rtmp'):
                        fmt = {
                            'url': server,
                            'play_path': candidate,
                            'format_id': 'a%s-rtmp-%s' % (index, quality),
                        }
                    else:
                        fmt = {
                            'url': candidate,
                            'format_id': 'a%s-%s-%s' % (index, ext, quality)
                        }

                    # Dimensions are taken from a '_<width>x<height>.mp4' URL suffix
                    size = re.search(
                        r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', candidate)
                    if size:
                        fmt['width'] = int(size.group('width'))
                        fmt['height'] = int(size.group('height'))
                    if is_audio:
                        fmt['vcodec'] = 'none'
                    collected.append(fmt)
        return collected
118 | ||
119 | ||
class ARDIE(InfoExtractor):
    """Extractor for daserste.de video pages (URLs matching _VALID_URL).

    Metadata, formats and subtitles are read from the '~playerXml.xml'
    document located next to the page URL.
    """

    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
    _TESTS = [{
        # available till 7.12.2023
        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
        'md5': '94812e6438488fb923c361a44469614b',
        'info_dict': {
            'id': 'maischberger-video-424',
            'display_id': 'maischberger-video-424',
            'ext': 'mp4',
            'duration': 4452.0,
            'title': 'maischberger am 07.12.2022',
            'upload_date': '20221207',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
        'only_matching': True,
    }, {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for a daserste.de video page.

        The player XML is fetched from '<mainurl>~playerXml.xml'. Each
        <asset> node with a file name yields either the formats of an
        m3u8/f4m manifest or a single format described by the asset's
        child elements.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            file_name = xpath_text(a, './fileName', default=None)
            if not file_name:
                continue
            format_type = a.attrib.get('type')
            format_url = url_or_none(file_name)
            if format_url:
                ext = determine_ext(file_name)
                if ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, display_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_type or 'hls', fatal=False))
                    continue
                elif ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        update_url_query(format_url, {'hdcore': '3.7.0'}),
                        display_id, f4m_id=format_type or 'hds', fatal=False))
                    continue
            f = {
                'format_id': format_type,
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix', default=None)
            if server_prefix:
                # Same key as ARDMediathekBaseIE uses for RTMP formats;
                # the previous spelling 'playpath' did not match it
                f.update({
                    'url': server_prefix,
                    'play_path': file_name,
                })
            elif format_url:
                f['url'] = format_url
            else:
                # Neither a server prefix nor a usable URL: nothing to download
                continue
            formats.append(f)

        sub_formats = (
            ('./dataTimedText', 'ttml'),
            ('./dataTimedTextNoOffset', 'ttml'),
            ('./dataTimedTextVtt', 'vtt'),
        )

        subtitles = {}
        for subsel, subext in sub_formats:
            for node in video_node.findall(subsel):
                sub_url = node.attrib.get('url')
                if not sub_url:
                    # Skip nodes without a 'url' attribute instead of raising KeyError
                    continue
                subtitles.setdefault('de', []).append({
                    'url': sub_url,
                    'ext': subext,
                })

        return {
            'id': xpath_text(video_node, './videoId', default=display_id),
            'formats': formats,
            'subtitles': subtitles,
            'display_id': display_id,
            # xpath_text yields None for a missing element, where
            # `.find(...).text` raised AttributeError
            'title': xpath_text(video_node, './title'),
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
230 | ||
231 | ||
class ARDBetaMediathekIE(InfoExtractor):
    """Extractor for single ardmediathek.de items (player, live and video URLs).

    Item data is read from the page-gateway API. If an 'ams' cookie is
    present for _TOKEN_URL, an id token is fetched and sent along with the
    API request.
    """

    IE_NAME = 'ARDMediathek'
    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:[^/]+/)?
        (?:player|live|video)/
        (?:[^?#]+/)?
        (?P<id>[a-zA-Z0-9]+)
        /?(?:[?#]|$)'''
    _GEO_COUNTRIES = ['DE']
    _TOKEN_URL = 'https://sso.ardmediathek.de/sso/token'

    _TESTS = [{
        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
        'info_dict': {
            'display_id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
            'id': '12939099',
            'title': 'Liebe auf vier Pfoten',
            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
            'duration': 5222,
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
            'timestamp': 1701343800,
            'upload_date': '20231130',
            'ext': 'mp4',
            'episode': 'Liebe auf vier Pfoten',
            'series': 'Filme im MDR',
            'age_limit': 0,
            'channel': 'MDR',
            '_old_archive_ids': ['ardbetamediathek Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0'],
        },
    }, {
        'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
        'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
        'info_dict': {
            'display_id': 'die-robuste-roswita',
            'id': '78566716',
            'title': 'Die robuste Roswita',
            'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
            'duration': 5316,
            'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
            'timestamp': 1596658200,
            'upload_date': '20200805',
            'ext': 'mp4',
        },
        'skip': 'Error',
    }, {
        'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
        'md5': '1e73ded21cb79bac065117e80c81dc88',
        'info_dict': {
            'id': '10049223',
            'ext': 'mp4',
            'title': 'tagesschau, 20:00 Uhr',
            'timestamp': 1636398000,
            'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
            'upload_date': '20211108',
            'display_id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
            'duration': 915,
            'episode': 'tagesschau, 20:00 Uhr',
            'series': 'tagesschau',
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
            'channel': 'ARD-Aktuell',
            '_old_archive_ids': ['ardbetamediathek Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll'],
        },
    }, {
        'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
        'md5': 'c428b9effff18ff624d4f903bda26315',
        'info_dict': {
            'id': '94834686',
            'ext': 'mp4',
            'duration': 2700,
            'episode': '7 Tage ... unter harten Jungs',
            'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
            'upload_date': '20231005',
            'timestamp': 1696491171,
            'display_id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
            'series': '7 Tage ...',
            'channel': 'HR',
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
            'title': '7 Tage ... unter harten Jungs',
            '_old_archive_ids': ['ardbetamediathek N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3'],
        },
    }, {
        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
        'only_matching': True,
    }, {
        'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
        'only_matching': True,
    }]

    def _extract_episode_info(self, title):
        """Derive season/episode metadata from an item title.

        The patterns are tried in order and the first one that matches
        supplies the result. Returns a dict with any of 'season_number',
        'episode_number' and 'episode'; an empty dict if nothing could be
        derived.
        """
        patterns = [
            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
            # E.g.: title="Folge 25/42: Symmetrie"
            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
            # E.g.: title="Folge 1063 - Vertrauen"
            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
            # As a fallback use the full title
            r'(?P<title>.*)',
        ]

        # `or {}` keeps the `**` unpacking at the call site safe should no
        # pattern produce a result (e.g. when title is not a string)
        return traverse_obj(patterns, (..., {functools.partial(re.match, string=title)}, {
            'season_number': ('season_number', {int_or_none}),
            'episode_number': ('episode_number', {int_or_none}),
            'episode': ((
                ('episode', {str_or_none}),
                ('ep_info', {lambda x: title.replace(x, '')}),
                ('title', {str}),
            ), {str.strip}),
        }), get_all=False) or {}

    def _real_extract(self, url):
        """Return the info dict for a single ARD Mediathek item.

        Raises a login-required error when the player widget reports
        'blockedByFsk'.
        """
        display_id = self._match_id(url)
        query = {'embedded': 'false', 'mcV6': 'true'}
        headers = {}

        if self._get_cookies(self._TOKEN_URL).get('ams'):
            token = self._download_json(
                self._TOKEN_URL, display_id, 'Fetching token for age verification',
                'Unable to fetch age verification token', fatal=False)
            id_token = traverse_obj(token, ('idToken', {str}))
            decoded_token = traverse_obj(id_token, ({jwt_decode_hs256}, {dict}))
            user_id = traverse_obj(decoded_token, (('user_id', 'sub'), {str}), get_all=False)
            if not user_id:
                self.report_warning('Unable to extract token, continuing without authentication')
            else:
                headers['x-authorization'] = f'Bearer {id_token}'
                query['userId'] = user_id
                if decoded_token.get('age_rating') != 18:
                    self.report_warning('Account is not verified as 18+; video may be unavailable')

        page_data = self._download_json(
            f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}',
            display_id, query=query, headers=headers)

        # For user convenience we use the old contentId instead of the longer crid
        # Ref: https://github.com/yt-dlp/yt-dlp/issues/8731#issuecomment-1874398283
        old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId', {int}))
        if old_id is not None:
            video_id = str(old_id)
            archive_ids = [make_archive_id(ARDBetaMediathekIE, display_id)]
        else:
            self.report_warning(f'Could not extract contentId{bug_reports_message()}')
            video_id = display_id
            archive_ids = None

        # Fall back to an empty dict when no player widget is present, so the
        # lookups below do not fail with AttributeError on None
        player_data = traverse_obj(
            page_data, ('widgets', lambda _, v: v['type'] in ('player_ondemand', 'player_live'), {dict}),
            get_all=False) or {}
        is_live = player_data.get('type') == 'player_live'
        media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))

        if player_data.get('blockedByFsk'):
            self.raise_login_required('This video is only available for age verified users or after 22:00')

        formats = []
        subtitles = {}
        for stream in traverse_obj(media_data, ('streams', ..., {dict})):
            kind = stream.get('kind')
            # Prioritize main stream over sign language and others
            preference = 1 if kind == 'main' else None
            for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))):
                media_url = media['url']

                audio_kind = traverse_obj(media, (
                    'audios', 0, 'kind', {str}), default='').replace('standard', '')
                lang_code = traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu'
                lang = join_nonempty(lang_code, audio_kind)
                language_preference = 10 if lang == 'deu' else -10

                if determine_ext(media_url) == 'm3u8':
                    # join_nonempty avoids an id like 'hls-None' when kind is missing
                    fmts, subs = self._extract_m3u8_formats_and_subtitles(
                        media_url, video_id, m3u8_id=join_nonempty('hls', kind),
                        preference=preference, fatal=False, live=is_live)
                    for f in fmts:
                        f['language'] = lang
                        f['language_preference'] = language_preference
                    formats.extend(fmts)
                    self._merge_subtitles(subs, target=subtitles)
                else:
                    formats.append({
                        'url': media_url,
                        'format_id': join_nonempty('http', kind),
                        'preference': preference,
                        'language': lang,
                        'language_preference': language_preference,
                        **traverse_obj(media, {
                            'format_note': ('forcedLabel', {str}),
                            'width': ('maxHResolutionPx', {int_or_none}),
                            'height': ('maxVResolutionPx', {int_or_none}),
                            'vcodec': ('videoCodec', {str}),
                        }),
                    })

        for sub in traverse_obj(media_data, ('subtitles', ..., {dict})):
            for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))):
                subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({
                    'url': sources['url'],
                    'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')),
                })

        age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none}))
        return {
            'id': video_id,
            'display_id': display_id,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': is_live,
            'age_limit': age_limit,
            **traverse_obj(media_data, ('meta', {
                'title': 'title',
                'description': 'synopsis',
                'timestamp': ('broadcastedOnDateTime', {parse_iso8601}),
                'series': 'seriesTitle',
                'thumbnail': ('images', 0, 'url', {url_or_none}),
                'duration': ('durationSeconds', {int_or_none}),
                'channel': 'clipSourceName',
            })),
            **self._extract_episode_info(page_data.get('title')),
            '_old_archive_ids': archive_ids,
        }
470 | ||
471 | ||
class ARDMediathekCollectionIE(InfoExtractor):
    """Playlist extractor for ardmediathek.de 'sendung', 'serie' and 'sammlung' URLs.

    Entries are fetched lazily from the page-gateway API in pages of
    _PAGE_SIZE items.
    """

    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:[^/?#]+/)?
        (?P<playlist>sendung|serie|sammlung)/
        (?:(?P<display_id>[^?#]+?)/)?
        (?P<id>[a-zA-Z0-9]+)
        (?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)'''
    _GEO_COUNTRIES = ['DE']

    _TESTS = [{
        'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV',
        'info_dict': {
            'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV',
            'display_id': 'quiz/staffel-1-originalversion',
            'title': 'Staffel 1 Originalversion',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD',
        'info_dict': {
            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD',
            'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription',
            'title': 'Staffel 4 mit Audiodeskription',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/',
        'info_dict': {
            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1',
            'display_id': 'babylon-berlin/staffel-1',
            'title': 'Staffel 1',
        },
        'playlist_count': 8,
    }, {
        'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
        'info_dict': {
            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
            'display_id': 'tatort',
            'title': 'Tatort',
        },
        'playlist_mincount': 500,
    }, {
        'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2',
        'info_dict': {
            'id': '5eOHzt8XB2sqeFXbIoJlg2',
            'display_id': 'die-kirche-bleibt-im-dorf',
            'title': 'Die Kirche bleibt im Dorf',
            'description': 'Die Kirche bleibt im Dorf',
        },
        'playlist_count': 4,
    }, {
        # playlist of type 'sendung'
        'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
        'only_matching': True,
    }, {
        # playlist of type 'serie'
        'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
        'only_matching': True,
    }, {
        # playlist of type 'sammlung'
        'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
        'only_matching': True,
    }]

    _PAGE_SIZE = 100

    def _real_extract(self, url):
        """Return a playlist result whose entries are loaded page by page.

        The playlist id is built from the collection id plus, when present
        in the URL, the season number and version ('OV' or 'AD'), joined
        with '_'.
        """
        playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group(
            'id', 'display_id', 'playlist', 'season', 'version')

        def call_api(page_num):
            # 'sammlung' collections use a different API endpoint than shows/series
            api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset'
            return self._download_json(
                f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id,
                f'Downloading playlist page {page_num}', query={
                    'pageNumber': page_num,
                    'pageSize': self._PAGE_SIZE,
                    **({
                        'seasoned': 'true',
                        'seasonNumber': season_number,
                        'withOriginalversion': 'true' if version == 'OV' else 'false',
                        'withAudiodescription': 'true' if version == 'AD' else 'false',
                    } if season_number else {}),
                })

        # Page 0 is needed up front for the playlist title/description
        page_data = call_api(0)

        def fetch_page(page_num):
            # Reuse the already downloaded first page instead of requesting it again
            page = page_data if page_num == 0 else call_api(page_num)
            for item in traverse_obj(page, ('teasers', ..., {dict})):
                item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False)
                # Skip teasers without an id and ones that point back at this playlist
                if not item_id or item_id == playlist_id:
                    continue
                item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video'
                yield self.url_result(
                    f'https://www.ardmediathek.de/{item_mode}/{item_id}',
                    ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE),
                    **traverse_obj(item, {
                        'id': ('id', {str}),
                        'title': ('longTitle', {str}),
                        'duration': ('duration', {int_or_none}),
                        'timestamp': ('broadcastedOn', {parse_iso8601}),
                    }))

        full_id = join_nonempty(playlist_id, season_number, version, delim='_')

        return self.playlist_result(
            OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id,
            title=page_data.get('title'), description=page_data.get('synopsis'))