]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/rheinmaintv.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / rheinmaintv.py
CommitLineData
from .common import InfoExtractor
from ..utils import ExtractorError, extract_attributes, merge_dicts, remove_end
3
4
class RheinMainTVIE(InfoExtractor):
    """Extractor for video pages under rheinmaintv.de/sendungen/.

    The matched URL ends in ``<display_id>/vom-DD.MM.YYYY`` (optionally
    followed by ``/<number>``); that whole tail, with slashes replaced by
    dashes, is used as the video id.
    """

    _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)'
    _TESTS = [{
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/',
        'info_dict': {
            'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022',
            'ext': 'ismv',  # ismv+isma will be merged into mp4
            'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft',
            'title': 'Auf dem Weg zur Deutschen Meisterschaft',
            'upload_date': '20221108',
            'view_count': int,
            'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft',
            'thumbnail': r're:^https://.+\.jpg',
            'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9',
            'timestamp': 1667933057,
            'duration': 243.0,
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
        'info_dict': {
            'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022',
            'ext': 'ismv',
            'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
            'timestamp': 1668526214,
            'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften',
            'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
            'view_count': int,
            'thumbnail': r're:^https://.+\.jpg',
            'duration': 345.0,
            'description': 'md5:9370ba29526984006c2cba1372e5c5a0',
            'upload_date': '20221115',
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
        'info_dict': {
            'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022',
            'ext': 'ismv',
            'title': 'Casino Mainz bei den Deutschen Meisterschaften',
            'view_count': int,
            'timestamp': 1668527402,
            'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften',
            'upload_date': '20221115',
            'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften',
            'duration': 348.0,
            'thumbnail': r're:^https://.+\.jpg',
            'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa',
        },
        'params': {'skip_download': 'ism'},
    }, {
        'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a single rheinmaintv.de video page.

        The ISM manifest URL is taken from the ``src`` attribute of the
        page's ``<source>`` tag, falling back to the ``embedUrl`` of the first
        JSON-LD entry typed ``VideoObject`` that has one.

        Raises:
            ExtractorError: if neither location provides a manifest URL.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('display_id')
        # 'video_id' spans '<display_id>/vom-<date>[/<n>]'; flatten it to one token
        video_id = mobj.group('video_id').replace('/', '-')
        webpage = self._download_webpage(url, video_id)

        # A <source ...> tag immediately followed by an <img ...> tag
        source, img = self._search_regex(
            r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)',
            webpage, 'video', group=('source', 'img'))
        source = extract_attributes(source)
        img = extract_attributes(img)

        raw_json_ld = list(self._yield_json_ld(webpage, video_id))
        json_ld = self._json_ld(raw_json_ld, video_id)
        # Keep the JSON-LD 'url' out of the merged result below
        # (presumably so it cannot stand in for the extracted formats - TODO confirm)
        json_ld.pop('url', None)

        # next() gets an explicit default so a page without a usable
        # VideoObject entry does not leak a bare StopIteration; the loop
        # variable is deliberately not named 'json_ld' to avoid confusion
        # with the parsed metadata dict above.
        ism_manifest_url = source.get('src') or next((
            entry['embedUrl'] for entry in raw_json_ld
            if entry.get('@type') == 'VideoObject' and entry.get('embedUrl')
        ), None)
        if not ism_manifest_url:
            raise ExtractorError('Unable to find ISM manifest URL', video_id=video_id)
        formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id)

        # Values given explicitly here are listed first; json_ld supplies the
        # remaining metadata through merge_dicts.
        return merge_dicts({
            'id': video_id,
            'display_id': display_id,
            'title':
                self._html_search_regex(r'<h1><span class="title">([^<]*)</span>',
                                        webpage, 'headline', default=None)
                or img.get('title') or json_ld.get('title') or self._og_search_title(webpage)
                or remove_end(self._html_extract_title(webpage), ' -'),
            'alt_title': img.get('alt'),
            'description': json_ld.get('description') or self._og_search_description(webpage),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'),
        }, json_ld)