jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/rtvslo.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / rtvslo.py
CommitLineData
dfa6661e 1from .common import InfoExtractor
2from ..utils import (
c6e07cf1 3 ExtractorError,
94389b22 4 int_or_none,
c6e07cf1 5 parse_duration,
6 traverse_obj,
7 unified_timestamp,
8 url_or_none,
dfa6661e 9)
10
11
class RTVSLOIE(InfoExtractor):
    """Extractor for RTV SLO archive recordings.

    Handles URLs on 365.rtvslo.si, 4d.rtvslo.si and (www.)rtvslo.si/rtv365
    (see `_VALID_URL`); the numeric recording id is the last path component.
    """

    IE_NAME = 'rtvslo.si'
    # All literal dots in the host names are escaped so that look-alike hosts
    # (e.g. "365.rtvsloXsi") are not matched.
    _VALID_URL = r'''(?x)
        https?://(?:
            (?:365|4d)\.rtvslo\.si/arhiv/[^/?#&;]+|
            (?:www\.)?rtvslo\.si/rtv365/arhiv
        )/(?P<id>\d+)'''
    _GEO_COUNTRIES = ['SI']

    # Template filled with (API method name, recording id).
    _API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622'
    # Maps the subtitle language label reported by the API to a language code.
    SUB_LANGS_MAP = {'Slovenski': 'sl'}

    _TESTS = [
        {
            'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv',
            'info_dict': {
                'id': '174842550',
                'ext': 'mp4',
                'release_timestamp': 1643140032,
                'upload_date': '20220125',
                'series': 'Dnevnik',
                'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg',
                'description': 'md5:76a18692757aeb8f0f51221106277dd2',
                'timestamp': 1643137046,
                'title': 'Dnevnik',
                'series_id': '92',
                'release_date': '20220125',
                'duration': 1789,
            },
        }, {
            'url': 'https://365.rtvslo.si/arhiv/utrip/174843754',
            'info_dict': {
                'id': '174843754',
                'ext': 'mp4',
                'series_id': '94',
                'release_date': '20220129',
                'timestamp': 1643484455,
                'title': 'Utrip',
                'duration': 813,
                'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg',
                'description': 'md5:77f2892630c7b17bb7a5bb84319020c9',
                'release_timestamp': 1643485825,
                'upload_date': '20220129',
                'series': 'Utrip',
            },
        }, {
            'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609',
            'info_dict': {
                'id': '174844609',
                'ext': 'mp3',
                'series_id': '106615841',
                'title': 'Il giornale della sera',
                'duration': 1328,
                'series': 'Il giornale della sera',
                'timestamp': 1643743800,
                'release_timestamp': 1643745424,
                'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg',
                'upload_date': '20220201',
                'tbr': 128000,
                'release_date': '20220201',
            },
        }, {
            'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750',
            'info_dict': {
                'id': '148350750',
                'ext': 'mp4',
                'title': 'Prvi šolski dan, mozaična oddaja za mlade',
                'series': 'Razred zase',
                'series_id': '148185730',
                'duration': 1481,
                'upload_date': '20121019',
                'timestamp': 1350672122,
                'release_date': '20121019',
                'release_timestamp': 1350672122,
                'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg',
            },
        }, {
            'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for the recording referenced by `url`.

        Two API calls are made: `getRecordingDrm` for the metadata (and the
        JWT it carries), then `getMedia` (authenticated with that JWT) for the
        stream descriptions.

        Raises ExtractorError when the metadata contains no JWT, or when the
        API only offers the placeholder clip and flags the recording as an
        error stub. Geo-restriction is reported via `raise_geo_restricted`.
        """
        v_id = self._match_id(url)
        meta = self._download_json(self._API_BASE.format('getRecordingDrm', v_id), v_id)['response']

        thumbs = [{'id': k, 'url': v, 'http_headers': {'Accept': 'image/jpeg'}}
                  for k, v in (meta.get('images') or {}).items()]

        subs = {}
        for s in traverse_obj(meta, 'subs', 'subtitles', default=[]):
            sub_url = s.get('file')
            if not sub_url:
                # A subtitle entry without a file cannot be downloaded; skip it
                # instead of emitting a track whose URL is None.
                continue
            lang = self.SUB_LANGS_MAP.get(s.get('language'), s.get('language') or 'und')
            subs.setdefault(lang, []).append({
                'url': sub_url,
                'ext': traverse_obj(s, 'format', expected_type=str.lower),
            })

        jwt = meta.get('jwt')
        if not jwt:
            raise ExtractorError('Site did not provide an authentication token, cannot proceed.')

        media = self._download_json(self._API_BASE.format('getMedia', v_id), v_id, query={'jwt': jwt})['response']

        formats = []
        skip_protocols = ['smil', 'f4m', 'dash']
        # NOTE: 'addaptiveMedia' (double "d") is the key spelling used by the API.
        adaptive_url = traverse_obj(media, ('addaptiveMedia', 'hls_sec'), expected_type=url_or_none)
        if adaptive_url:
            formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols)

        adaptive_url = traverse_obj(media, ('addaptiveMedia_sl', 'hls_sec'), expected_type=url_or_none)
        if adaptive_url:
            for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols):
                formats.append({
                    **f,
                    'format_id': 'sign-' + f['format_id'],
                    'format_note': 'Sign language interpretation', 'preference': -10,
                    # NOTE(review): audio tracks tagged 'eng' are relabelled 'slv';
                    # presumably the manifest mislabels the language - confirm.
                    'language': (
                        'slv' if f.get('language') == 'eng' and f.get('acodec') != 'none'
                        else f.get('language')),
                })

        # Direct HTTPS files.
        for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['https']))):
            formats.append(traverse_obj(mediafile, {
                'url': ('streams', 'https'),
                'ext': ('mediaType', {str.lower}),
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
                'tbr': ('bitrate', {int_or_none}),
                'filesize': ('filesize', {int_or_none}),
            }))

        # Per-file HLS streams.
        for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['hls_sec']))):
            formats.extend(self._extract_wowza_formats(
                mediafile['streams']['hls_sec'], v_id, skip_protocols=skip_protocols))

        # Placeholder clips returned in place of the real media.
        if any('intermission.mp4' in x['url'] for x in formats):
            self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
        if any('dummy_720p.mp4' in x.get('manifest_url', '') for x in formats) and meta.get('stub') == 'error':
            raise ExtractorError(f'{self.IE_NAME} said: Clip not available', expected=True)

        # Only trust the canonical URL when both parts are present as strings;
        # otherwise fall back to the URL we were given rather than reporting an
        # empty or partial webpage_url.
        canonical = traverse_obj(meta, ('canonical', ('domain', 'path')), expected_type=str) or []
        webpage_url = ''.join(canonical) if len(canonical) == 2 else url

        return {
            'id': v_id,
            'webpage_url': webpage_url,
            'title': meta.get('title'),
            'formats': formats,
            'subtitles': subs,
            'thumbnails': thumbs,
            'description': meta.get('description'),
            'timestamp': unified_timestamp(traverse_obj(meta, 'broadcastDate', ('broadcastDates', 0))),
            'release_timestamp': unified_timestamp(meta.get('recordingDate')),
            'duration': meta.get('duration') or parse_duration(meta.get('length')),
            'tags': meta.get('genre'),
            'series': meta.get('showName'),
            'series_id': meta.get('showId'),
        }