]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tagesschau.py
[test/download] Fallback test to `bv`
[yt-dlp.git] / yt_dlp / extractor / tagesschau.py
CommitLineData
dcdb292f 1# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    extract_attributes,
    int_or_none,
    js_to_json,
    try_get,
)
0e3ae924 13
14
class TagesschauIE(InfoExtractor):
    """Extractor for tagesschau.de pages matched by ``_VALID_URL``.

    Media is discovered by scanning every ``<div>`` opening tag of the page
    for a ``data-config`` attribute holding a JSON player configuration.
    """

    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        'md5': '7a7287612fa881a1ae1d087df45c2fd6',
        'info_dict': {
            'id': 'video-102143-1',
            'ext': 'mp4',
            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
        'md5': '3c54c1f6243d279b706bde660ceec633',
        'info_dict': {
            'id': 'ts-5727-1',
            'ext': 'mp4',
            'title': 'Ganze Sendung',
        },
    }, {
        # exclusive audio
        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
        'md5': '4cf22023c285f35e99c24d290ba58cc9',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
        'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
        'info_dict': {
            'id': 'bnd-303-1',
            'ext': 'mp4',
            'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
        },
    }, {
        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
        'info_dict': {
            'id': 'afd-parteitag-135',
            'title': 'AfD',
        },
        'playlist_count': 20,
    }, {
        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
        'info_dict': {
            'id': 'audio-29417-1',
            'ext': 'mp3',
            'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
        },
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tagesschau.de/100sekunden/index.html',
        'only_matching': True,
    }, {
        # playlist article with collapsing sections
        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the media embedded in a tagesschau.de page.

        Returns a playlist result when more than one playable entry is found,
        otherwise a single info dict enriched with page-level metadata
        (JSON-LD / OpenGraph).

        Raises ExtractorError when the page contains no playable media.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id') or mobj.group('path')
        display_id = video_id.lstrip('-')

        webpage = self._download_webpage(url, display_id)

        # Prefer the on-page headline; fall back to the OpenGraph title.
        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)

        entries = []
        num = 0  # 1-based counter of player configs that carry a stream list
        for div_tag in re.findall(r'<div[^>]+>', webpage):
            config = extract_attributes(div_tag).get('data-config')
            if not config:
                continue
            video = self._parse_json(config, video_id, transform_source=js_to_json, fatal=False)
            video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
            if not video_formats:
                continue
            num += 1
            for video_format in video_formats:
                # Only accept string URLs so that an unexpected item shape is
                # skipped instead of aborting the whole extraction.
                media_url = try_get(video_format, lambda x: x['_stream'], str) or ''
                formats = []
                if media_url.endswith('master.m3u8'):
                    formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
                elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
                    formats = [{
                        'url': media_url,
                        'vcodec': 'none',
                    }]
                if not formats:
                    continue
                # Sort here so that playlist entries are sorted too, not only
                # the single-video result.
                self._sort_formats(formats)
                entries.append({
                    'id': '%s-%d' % (display_id, num),
                    'title': try_get(video, lambda x: x['mc']['_title']),
                    'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
                    'formats': formats,
                })

        if not entries:
            # Previously this fell through to entries[0] and died with a bare
            # IndexError; report a proper extraction error instead.
            raise ExtractorError('No video or audio found on this page', expected=True)

        if len(entries) > 1:
            return self.playlist_result(entries, display_id, title)

        entry = entries[0]
        # JSON-LD only feeds optional fields, so its absence must not be fatal.
        video_info = self._search_json_ld(webpage, video_id, default={})
        description = video_info.get('description')
        thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
        timestamp = video_info.get('timestamp')
        # Original fallback order is kept; the player config title is a last resort.
        title = title or video_info.get('description') or entry.get('title')

        return {
            'id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': entry['formats'],
            'timestamp': timestamp,
            'description': description,
            'duration': entry.get('duration'),
        }