]>
Commit | Line | Data |
---|---|---|
1 | import functools | |
2 | import re | |
3 | ||
4 | from .common import InfoExtractor | |
5 | from ..utils import ( | |
6 | clean_html, | |
7 | extract_attributes, | |
8 | get_element_text_and_html_by_tag, | |
9 | get_elements_by_class, | |
10 | join_nonempty, | |
11 | js_to_json, | |
12 | mimetype2ext, | |
13 | unified_strdate, | |
14 | url_or_none, | |
15 | urljoin, | |
16 | variadic, | |
17 | ) | |
18 | from ..utils.traversal import traverse_obj | |
19 | ||
20 | ||
def html_get_element(tag=None, cls=None):
    """Build a callable that extracts the first matching element from HTML.

    At least one of `tag` / `cls` must be truthy. When `cls` is given the
    lookup goes through `get_elements_by_class` (with `tag` forwarded as a
    keyword argument); otherwise it goes through
    `get_element_text_and_html_by_tag` for `tag`.

    The returned callable takes the HTML string, passes the finder's result
    through `variadic` and returns its first item.
    """
    assert tag or cls, 'One of tag or class is required'

    # Class-based lookup takes precedence whenever a class name is supplied.
    finder = (
        functools.partial(get_elements_by_class, cls, tag=tag) if cls
        else functools.partial(get_element_text_and_html_by_tag, tag))

    def html_get_element_wrapper(html):
        found = variadic(finder(html))
        return found[0]

    return html_get_element_wrapper
33 | ||
34 | ||
class BpbIE(InfoExtractor):
    """Extractor for media pages on bpb.de whose URL path contains a numeric ID."""

    IE_DESC = 'Bundeszentrale für politische Bildung'
    _VALID_URL = r'https?://(?:www\.|m\.)?bpb\.de/(?:[^/?#]+/)*(?P<id>\d+)(?:[/?#]|$)'

    _TESTS = [{
        'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
        'info_dict': {
            'id': '297',
            'ext': 'mp4',
            'creator': 'Kooperative Berlin',
            'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
            'release_date': '20160115',
            'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
            'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
            'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
            'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
    }, {
        'url': 'https://www.bpb.de/mediathek/video/522184/krieg-flucht-und-falschmeldungen-wirstattdesinformation-2/',
        'info_dict': {
            'id': '522184',
            'ext': 'mp4',
            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
            'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
            'release_date': '20230621',
            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
            'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
            'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
    }, {
        'url': 'https://www.bpb.de/lernen/bewegtbild-und-politische-bildung/webvideo/518789/krieg-flucht-und-falschmeldungen-wirstattdesinformation-1/',
        'info_dict': {
            'id': '518789',
            'ext': 'mp4',
            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
            'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
            'release_date': '20230302',
            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
            'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
            'title': 'md5:3e956f264bb501f6383f10495a401da4',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
    }, {
        'url': 'https://www.bpb.de/mediathek/podcasts/apuz-podcast/539727/apuz-20-china/',
        'only_matching': True,
    }, {
        'url': 'https://www.bpb.de/mediathek/audio/315813/folge-1-eine-einfuehrung/',
        'info_dict': {
            'id': '315813',
            'ext': 'mp3',
            'creator': 'Axel Schröder',
            'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
            'release_date': '20200921',
            'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
            'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
            'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
            'title': 'Folge 1: Eine Einführung',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
    }, {
        'url': 'https://www.bpb.de/517806/die-weltanschauung-der-neuen-rechten/',
        'info_dict': {
            'id': '517806',
            'ext': 'mp3',
            'creator': 'Bundeszentrale für politische Bildung',
            'description': 'md5:594689600e919912aade0b2871cc3fed',
            'release_date': '20230127',
            'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
            'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
            'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
            'title': 'Die Weltanschauung der "Neuen Rechten"',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
    }, {
        'url': 'https://www.bpb.de/mediathek/reihen/zahlen-und-fakten-soziale-situation-filme/520153/zahlen-und-fakten-die-soziale-situation-in-deutschland-migration/',
        'only_matching': True,
    }]

    # Splits the header's inner HTML into the leading text ("title") and, when
    # a nested tag follows, the text right after that tag ("series").
    # The nested-tag part is optional so that a header consisting of plain
    # text still yields a title; "series" is then an unmatched (None) group.
    _TITLE_RE = re.compile(r'(?P<title>[^<]*)(?:<[^>]+>(?P<series>[^<]*))?')

    def _parse_vue_attributes(self, name, string, video_id):
        """Return the attributes of the first `<name ...>` opening tag in `string`.

        Attributes whose name starts with ':' have their value run through
        `js_to_json` and parsed as JSON (non-fatally); all other attributes
        are returned as extracted. `video_id` is only forwarded to
        `_parse_json`.
        """
        opening_tag = self._search_regex(rf'(<{name}(?:"[^"]*?"|[^>])*>)', string, name)

        # Build a new mapping instead of reassigning entries of the dict
        # that is being iterated.
        return {
            key: (
                self._parse_json(value, video_id, transform_source=js_to_json, fatal=False)
                if key.startswith(':') else value)
            for key, value in extract_attributes(opening_tag).items()
        }

    @staticmethod
    def _process_source(source):
        """Turn one player source entry into a format dict.

        Reads `src` and `type` from `source`. Returns None when `src` is
        missing or rejected by `url_or_none`. For sources whose type starts
        with 'video', the format note is the part of the URL between the
        last '_' and the last '.'; a note of 'high' gets quality 10, anything
        else quality 0.
        """
        # A missing `src` is treated like an invalid one instead of raising.
        url = url_or_none(source.get('src'))
        if not url:
            return None

        # `type` may be absent or explicitly null; both are treated as unknown.
        source_type = source.get('type') or ''
        extension = mimetype2ext(source_type)
        is_video = source_type.startswith('video')
        note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None

        return {
            'url': url,
            'ext': extension,
            'vcodec': None if is_video else 'none',
            'quality': 10 if note == 'high' else 0,
            'format_note': note,
            'format_id': join_nonempty(extension, note),
        }

    def _real_extract(self, url):
        """Download the page at `url` and build the info dict from its markup.

        Title and series come from the 'opening-header__title' element,
        the description from the 'opening-intro' element plus the
        'text-content' of the 'bpb-accordion-item' element, creator / uploader /
        release date from meta tags, tags from JSON-LD keywords, and formats
        and thumbnail from the attributes of the `bpb-player` tag.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
        json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))

        return {
            'id': video_id,
            'title': traverse_obj(title_result, ('title', {str.strip})) or None,
            # This metadata could be interpreted otherwise, but it fits "series" the most
            'series': traverse_obj(title_result, ('series', {str.strip})) or None,
            'description': join_nonempty(*traverse_obj(webpage, [(
                {html_get_element(cls='opening-intro')},
                [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
            ), {clean_html}]), delim='\n\n') or None,
            'creator': self._html_search_meta('author', webpage),
            'uploader': self._html_search_meta('publisher', webpage),
            'release_date': unified_strdate(self._html_search_meta('date', webpage)),
            'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
            **traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), {
                'formats': (':sources', ..., {self._process_source}),
                # The poster may be a relative path; resolve it against the page URL.
                'thumbnail': ('poster', {lambda x: urljoin(url, x)}),
            }),
        }