]>
Commit | Line | Data |
---|---|---|
1 | from .common import InfoExtractor | |
2 | from ..utils import parse_duration, parse_iso8601, traverse_obj | |
3 | ||
4 | ||
class NOSNLArticleIE(InfoExtractor):
    """Extractor for nos.nl article and video pages.

    The page's Next.js data is read and every embedded video/audio item is
    returned as an entry of a single playlist.
    """

    _VALID_URL = r'https?://nos\.nl/(?P<type>video|(\w+/)?\w+)/?\d+-(?P<display_id>[\w-]+)'
    _TESTS = [
        {
            # only 1 video
            'url': 'https://nos.nl/nieuwsuur/artikel/2440353-verzakking-door-droogte-dreigt-tot-een-miljoen-kwetsbare-huizen',
            'info_dict': {
                'id': '2440340',
                'ext': 'mp4',
                'description': 'md5:5f83185d902ac97af3af4bed7ece3db5',
                'title': '\'We hebben een huis vol met scheuren\'',
                'duration': 95.0,
                'thumbnail': 'https://cdn.nos.nl/image/2022/08/12/887149/3840x2160a.jpg',
            }
        }, {
            # more than 1 video
            'url': 'https://nos.nl/artikel/2440409-vannacht-sliepen-weer-enkele-honderden-asielzoekers-in-ter-apel-buiten',
            'info_dict': {
                'id': '2440409',
                'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten',
                'description': 'md5:72b1e1674d798460e79d78fa37e9f56d',
                'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'],
                'modified_timestamp': 1660452773,
                'modified_date': '20220814',
                'upload_date': '20220813',
                'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg',
                'timestamp': 1660401384,
                'categories': ['Regionaal nieuws', 'Binnenland'],
            },
            'playlist_count': 2,
        }, {
            # audio + video
            'url': 'https://nos.nl/artikel/2440789-wekdienst-16-8-groningse-acties-tien-jaar-na-zware-aardbeving-femke-bol-in-actie-op-ek-atletiek',
            'info_dict': {
                'id': '2440789',
                'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ',
                'description': 'md5:0bd277ed7a44fc15cb12a9d27d8f6641',
                'tags': ['wekdienst'],
                'modified_date': '20220816',
                'modified_timestamp': 1660625449,
                'timestamp': 1660625449,
                'upload_date': '20220816',
                'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg',
                'categories': ['Binnenland', 'Buitenland'],
            },
            'playlist_count': 2,
        }, {
            # video url
            'url': 'https://nos.nl/video/2452718-xi-en-trudeau-botsen-voor-de-camera-op-g20-top-je-hebt-gelekt',
            'info_dict': {
                'id': '2452718',
                'title': 'Xi en Trudeau botsen voor de camera op G20-top: \'Je hebt gelekt\'',
                'modified_date': '20221117',
                'description': 'md5:61907dac576f75c11bf8ffffd4a3cc0f',
                'tags': ['Xi', 'Trudeau', 'G20', 'indonesié'],
                'upload_date': '20221117',
                'thumbnail': 'https://cdn.nos.nl/image/2022/11/17/916155/1024x576a.jpg',
                'modified_timestamp': 1668663388,
                'timestamp': 1668663388,
                'categories': ['Buitenland'],
            },
            'playlist_mincount': 1,
        }
    ]

    def _entries(self, nextjs_json, display_id):
        """Yield one info dict per video or audio item.

        nextjs_json is an iterable of item dicts; items whose 'type' is
        neither 'video' nor 'audio' are skipped. display_id is passed to the
        m3u8 extraction helper as the id of the download.
        """
        for item in nextjs_json:
            item_type = item.get('type')

            if item_type == 'video':
                formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                    traverse_obj(item, ('source', 'url')), display_id, ext='mp4')

                # Only the first value found under 'imagesByRatio' is used.
                # The lookup yields nothing when the key is missing or empty,
                # so guard the index: thumbnails are optional metadata and
                # must not abort extraction of an otherwise valid video.
                images_by_ratio = traverse_obj(item, ('imagesByRatio', ...))
                images = images_by_ratio[0] if images_by_ratio else []

                yield {
                    'id': str(item['id']),
                    'title': item.get('title'),
                    'description': item.get('description'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'duration': parse_duration(item.get('duration')),
                    'thumbnails': [{
                        # first value found under the image's 'url' entry
                        'url': traverse_obj(image, ('url', ...), get_all=False),
                        'width': image.get('width'),
                        'height': image.get('height'),
                    } for image in images],
                }

            elif item_type == 'audio':
                yield {
                    'id': str(item['id']),
                    'title': item.get('title'),
                    'url': traverse_obj(item, ('media', 'src')),
                    'ext': 'mp3',
                }

    def _real_extract(self, url):
        """Download the page and return a playlist of its media items.

        Title, description and thumbnail fall back to the page's HTML meta
        tags when the Next.js data does not provide them.
        """
        site_type, display_id = self._match_valid_url(url).group('type', 'display_id')
        webpage = self._download_webpage(url, display_id)

        nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data']
        return {
            '_type': 'playlist',
            # /video/ pages carry a single item under 'video'; every other
            # matched page type lists its media under 'items'.
            'entries': self._entries(
                [nextjs_json['video']] if site_type == 'video' else nextjs_json['items'], display_id),
            'id': str(nextjs_json['id']),
            'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage),
            'description': (nextjs_json.get('description')
                            or self._html_search_meta(['description', 'twitter:description', 'og:description'], webpage)),
            'tags': nextjs_json.get('keywords'),
            'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')),
            'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage),
            'timestamp': parse_iso8601(nextjs_json.get('publishedAt')),
            'categories': traverse_obj(nextjs_json, ('categories', ..., 'label')),
        }