]>
Commit | Line | Data |
---|---|---|
866f0373 H |
1 | from .common import InfoExtractor |
2 | from ..utils import parse_duration, parse_iso8601, traverse_obj | |
3 | ||
4 | ||
class NOSNLArticleIE(InfoExtractor):
    """Extract the media embedded in a nos.nl article as a playlist.

    The article page embeds its data as Next.js JSON; every ``video`` and
    ``audio`` item found there becomes one playlist entry.
    """

    _VALID_URL = r'https?://nos\.nl/((?!video)(\w+/)?\w+/)\d+-(?P<display_id>[\w-]+)'
    _TESTS = [
        {
            # only 1 video
            'url': 'https://nos.nl/nieuwsuur/artikel/2440353-verzakking-door-droogte-dreigt-tot-een-miljoen-kwetsbare-huizen',
            'info_dict': {
                'id': '2440340',
                'ext': 'mp4',
                'description': 'md5:5f83185d902ac97af3af4bed7ece3db5',
                'title': '\'We hebben een huis vol met scheuren\'',
                'duration': 95.0,
                'thumbnail': 'https://cdn.nos.nl/image/2022/08/12/887149/3840x2160a.jpg',
            }
        }, {
            # more than 1 video
            'url': 'https://nos.nl/artikel/2440409-vannacht-sliepen-weer-enkele-honderden-asielzoekers-in-ter-apel-buiten',
            'info_dict': {
                'id': '2440409',
                'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten',
                'description': 'Er werd wel geprobeerd om kwetsbare migranten onderdak te bieden, zegt het COA.',
                'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'],
                'modified_timestamp': 1660452773,
                'modified_date': '20220814',
                'upload_date': '20220813',
                'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg',
                'timestamp': 1660401384,
            },
            'playlist_count': 2,
        }, {
            # audio + video
            'url': 'https://nos.nl/artikel/2440789-wekdienst-16-8-groningse-acties-tien-jaar-na-zware-aardbeving-femke-bol-in-actie-op-ek-atletiek',
            'info_dict': {
                'id': '2440789',
                'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ',
                'description': 'Nieuws, weer, verkeer: met dit overzicht begin je geïnformeerd aan de dag.',
                'tags': ['wekdienst'],
                'modified_date': '20220816',
                'modified_timestamp': 1660625449,
                'timestamp': 1660625449,
                'upload_date': '20220816',
                'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg',
            },
            'playlist_count': 2,
        }
    ]

    def _entries(self, nextjs_json, display_id):
        """Yield one info dict per video or audio item of the article.

        :param nextjs_json: the article data dict; its ``items`` list is iterated.
        :param display_id: identifier passed on to the m3u8 downloader for messages.
        :raises KeyError: if ``items`` is absent or a media item has no ``id``.

        Items whose ``type`` is neither ``video`` nor ``audio`` are skipped.
        """
        for item in nextjs_json['items']:
            item_type = item.get('type')

            if item_type == 'video':
                formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                    traverse_obj(item, ('source', 'url')), display_id, ext='mp4')

                # Only the first ratio group of `imagesByRatio` is used.
                # Thumbnails are optional: with get_all=False a missing or
                # empty `imagesByRatio` gives None instead of an empty list,
                # so fall back to [] rather than failing on `[0]` and losing
                # an otherwise playable video.
                images = traverse_obj(item, ('imagesByRatio', ...), get_all=False) or []

                yield {
                    'id': str(item['id']),
                    'title': item.get('title'),
                    'description': item.get('description'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'duration': parse_duration(item.get('duration')),
                    'thumbnails': [{
                        'url': traverse_obj(image, ('url', ...), get_all=False),
                        'width': image.get('width'),
                        'height': image.get('height'),
                    } for image in images],
                }

            elif item_type == 'audio':
                yield {
                    'id': str(item['id']),
                    'title': item.get('title'),
                    'url': traverse_obj(item, ('media', 'src')),
                    'ext': 'mp3',
                }

    def _real_extract(self, url):
        """Download the article page and return a playlist of its media items.

        Title, description and thumbnail are read from the Next.js data and
        fall back to the page's HTML meta tags when absent there.
        """
        display_id = self._match_valid_url(url).group('display_id')
        webpage = self._download_webpage(url, display_id)

        nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data']
        return {
            '_type': 'playlist',
            # Generator: entries are produced while the playlist is iterated.
            'entries': self._entries(nextjs_json, display_id),
            'id': str(nextjs_json['id']),
            'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage),
            'description': (nextjs_json.get('description')
                            or self._html_search_meta(['description', 'twitter:description', 'og:description'], webpage)),
            'tags': nextjs_json.get('keywords'),
            'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')),
            'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage),
            'timestamp': parse_iso8601(nextjs_json.get('publishedAt'))
        }