]>
Commit | Line | Data |
---|---|---|
98cb1eda | 1 | from .common import InfoExtractor |
2 | from ..utils import extract_attributes, merge_dicts, remove_end | |
3 | ||
4 | ||
class RheinMainTVIE(InfoExtractor):
    """Extractor for video pages under rheinmaintv.de/sendungen/.

    The video is delivered as an ISM manifest whose URL is read from the
    page's ``<source>`` tag or, failing that, from the ``embedUrl`` of a
    JSON-LD ``VideoObject`` embedded in the page.
    """

    _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)'
    _TESTS = [{
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/',
        'info_dict': {
            'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022',
            'ext': 'ismv',  # ismv+isma will be merged into mp4
            'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft',
            'title': 'Auf dem Weg zur Deutschen Meisterschaft',
            'upload_date': '20221108',
            'view_count': int,
            'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft',
            'thumbnail': r're:^https://.+\.jpg',
            'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9',
            'timestamp': 1667933057,
            'duration': 243.0,
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
        'info_dict': {
            'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022',
            'ext': 'ismv',
            'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
            'timestamp': 1668526214,
            'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften',
            'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
            'view_count': int,
            'thumbnail': r're:^https://.+\.jpg',
            'duration': 345.0,
            'description': 'md5:9370ba29526984006c2cba1372e5c5a0',
            'upload_date': '20221115',
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
        'info_dict': {
            'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022',
            'ext': 'ismv',
            'title': 'Casino Mainz bei den Deutschen Meisterschaften',
            'view_count': int,
            'timestamp': 1668527402,
            'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften',
            'upload_date': '20221115',
            'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften',
            'duration': 348.0,
            'thumbnail': r're:^https://.+\.jpg',
            'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa',
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a single rheinmaintv.de video page.

        The id is the matched ``<slug>/vom-DD.MM.YYYY[/n]`` part of the URL
        with slashes replaced by dashes; the slug alone is the display id.
        Page-derived fields are combined with the parsed JSON-LD metadata
        through ``merge_dicts``.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('display_id')
        video_id = mobj.group('video_id').replace('/', '-')
        webpage = self._download_webpage(url, video_id)

        # The player markup is a <source> tag immediately followed by an
        # <img> tag; both are captured in one match and parsed to attr dicts.
        source, img = self._search_regex(
            r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)',
            webpage, 'video', group=('source', 'img'))
        source = extract_attributes(source)
        img = extract_attributes(img)

        # Keep the raw JSON-LD objects around: 'embedUrl' is read from them
        # directly below, while the parsed form feeds the final merge.
        raw_json_ld = list(self._yield_json_ld(webpage, video_id))
        json_ld = self._json_ld(raw_json_ld, video_id)
        # NOTE(review): presumably dropped so that a JSON-LD 'url' does not end
        # up in the merged result as a media URL - confirm.
        json_ld.pop('url', None)

        # Prefer the <source> tag; otherwise use the first JSON-LD VideoObject
        # that actually carries an embedUrl. The explicit None default keeps a
        # bare StopIteration from escaping when no such object exists.
        ism_manifest_url = source.get('src') or next(
            (entry.get('embedUrl') for entry in raw_json_ld
             if entry.get('@type') == 'VideoObject' and entry.get('embedUrl')),
            None)
        if not ism_manifest_url:
            self.raise_no_formats('Unable to find the ISM manifest URL', video_id=video_id)
        formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id)

        # Title sources, in order: page headline, <img title>, JSON-LD,
        # OpenGraph, and finally the HTML <title> minus a trailing ' -'.
        title = (
            self._html_search_regex(
                r'<h1><span class="title">([^<]*)</span>',
                webpage, 'headline', default=None)
            or img.get('title') or json_ld.get('title') or self._og_search_title(webpage)
            or remove_end(self._html_extract_title(webpage), ' -'))

        return merge_dicts({
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'alt_title': img.get('alt'),
            'description': json_ld.get('description') or self._og_search_description(webpage),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'),
        }, json_ld)