]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/tagesschau.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / tagesschau.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 js_to_json,
6 extract_attributes,
7 try_get,
8 int_or_none,
9 )
10
11
class TagesschauIE(InfoExtractor):
    """Extract audio/video embedded in tagesschau.de ``.html`` pages.

    Media is discovered through the ``data-config`` attribute carried by
    ``<div>`` tags in the page. A page yielding more than one entry is
    returned as a playlist; otherwise a single info dict is returned.
    """

    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': '7a7287612fa881a1ae1d087df45c2fd6',
        'info_dict': {
            'id': 'video-102143-1',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '3c54c1f6243d279b706bde660ceec633',
        'info_dict': {
            'id': 'ts-5727-1',
            'ext': 'mp4',
            'title': 'Ganze Sendung',
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '4cf22023c285f35e99c24d290ba58cc9',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
        'info_dict': {
            'id': 'bnd-303-1',
            'ext': 'mp4',
            'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'AfD',
        },
        'playlist_count': 20,
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict (or playlist) for a tagesschau.de page.

        Every ``<div>`` tag with a ``data-config`` attribute is parsed as a
        JS object; its ``mc._mediaArray[0]._mediaStreamArray`` streams are
        turned into formats:

        * a ``_stream`` URL ending in ``master.m3u8`` is expanded via HLS;
        * a ``_stream`` URL ending in ``.hi.mp3`` and starting with
          ``https://download`` becomes a single audio-only format.

        Returns a playlist when more than one entry was collected, otherwise
        a single info dict whose description/thumbnail/timestamp come from
        the page's OpenGraph and JSON-LD data when available.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id') or mobj.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)

        entries = []
        num = 0  # running index of configs that carry streams; used in entry ids
        for div_tag in re.findall(r'<div[^>]+>', webpage):
            config = extract_attributes(div_tag).get('data-config')
            if not config:
                continue
            video = self._parse_json(config, video_id, transform_source=js_to_json, fatal=False)
            video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
            if not video_formats:
                continue
            num += 1
            # Per-video metadata does not depend on the individual stream,
            # so look it up once instead of once per stream.
            entry_id = '%s-%d' % (display_id, num)
            entry_title = try_get(video, lambda x: x['mc']['_title'])
            entry_duration = int_or_none(try_get(video, lambda x: x['mc']['_duration']))
            for video_format in video_formats:
                # Only accept string URLs: the suffix/prefix checks below need
                # a str, and a malformed stream entry should be skipped
                # rather than abort the whole extraction.
                media_url = try_get(video_format, lambda x: x['_stream'], str) or ''
                formats = []
                if media_url.endswith('master.m3u8'):
                    formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
                elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
                    formats = [{
                        'url': media_url,
                        'vcodec': 'none',
                    }]
                if not formats:
                    continue
                entries.append({
                    'id': entry_id,
                    'title': entry_title,
                    'duration': entry_duration,
                    'formats': formats,
                })

        if len(entries) > 1:
            return self.playlist_result(entries, display_id, title)

        if not entries:
            # Report a proper extractor error instead of an IndexError; with
            # expected=True this is downgraded to a warning when the user
            # asked to ignore missing formats.
            self.raise_no_formats('No media found', expected=True, video_id=display_id)
        formats = entries[0]['formats'] if entries else []

        # JSON-LD only contributes optional metadata here, so its absence
        # must not fail an otherwise successful extraction.
        video_info = self._search_json_ld(webpage, video_id, default={})
        description = video_info.get('description')
        thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
        timestamp = video_info.get('timestamp')
        # NOTE(review): the fallback title is taken from the JSON-LD
        # *description*; kept as in the original - confirm this is intended.
        title = title or video_info.get('description')

        return {
            'id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
            'timestamp': timestamp,
            'description': description,
        }