]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/hrti.py
Tolerate failure to `--write-link` due to unknown URL
[yt-dlp.git] / yt_dlp / extractor / hrti.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import json
5
6 from .common import InfoExtractor
7 from ..compat import compat_HTTPError
8 from ..utils import (
9 clean_html,
10 ExtractorError,
11 int_or_none,
12 parse_age_limit,
13 sanitized_Request,
14 try_get,
15 )
16
17
class HRTiBaseIE(InfoExtractor):
    """
    Base Information Extractor for Croatian Radiotelevision
    video on demand site https://hrti.hrt.hr
    Reverse engineered from the JavaScript app in app.min.js

    ``_real_initialize`` calls ``_initialize_api`` and ``_login``, which
    store ``_session_id``, ``_search_url``, ``_login_url``, ``_logout_url``
    and ``_token`` on the instance.
    """
    _NETRC_MACHINE = 'hrti'

    _APP_LANGUAGE = 'hr'
    _APP_VERSION = '1.1'
    _APP_PUBLICATION_ID = 'all_in_one'
    _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'

    def _initialize_api(self):
        """Open an API session and remember the endpoint URLs it reports.

        Sets ``self._session_id``, ``self._search_url``, ``self._login_url``
        and ``self._logout_url`` from the session information response.
        """
        init_data = {
            'application_publication_id': self._APP_PUBLICATION_ID
        }

        # First request: send the publication id and read back a uuid.
        uuid = self._download_json(
            self._API_URL, None, note='Downloading uuid',
            errnote='Unable to download uuid',
            data=json.dumps(init_data).encode('utf-8'))['uuid']

        app_data = {
            'uuid': uuid,
            'application_publication_id': self._APP_PUBLICATION_ID,
            'application_version': self._APP_VERSION
        }

        # Second request: same URL, but the method is forced to PUT.
        req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
        req.get_method = lambda: 'PUT'

        resources = self._download_json(
            req, None, note='Downloading session information',
            errnote='Unable to download session information')

        self._session_id = resources['session_id']

        modules = resources['modules']

        # The reported URIs are templates; fill in their placeholders here.
        self._search_url = modules['vod_catalog']['resources']['search']['uri'].format(
            language=self._APP_LANGUAGE,
            application_id=self._APP_PUBLICATION_ID)

        self._login_url = (modules['user']['resources']['login']['uri']
                           + '/format/json').format(session_id=self._session_id)

        # Stored as reported, without any placeholder substitution.
        self._logout_url = modules['user']['resources']['logout']['uri']

    def _login(self):
        """Log in with the configured credentials and store the streaming token.

        Sets ``self._token`` from the ``secure_streaming_token`` field of the
        login response.

        Calls ``self.raise_login_required()`` when the username or password
        is missing, and raises an expected ``ExtractorError`` carrying the
        API's error message when the response reports one.
        """
        username, password = self._get_login_info()
        # TODO: figure out authentication with cookies
        if username is None or password is None:
            self.raise_login_required()

        auth_data = {
            'username': username,
            'password': password,
        }

        try:
            auth_info = self._download_json(
                self._login_url, None, note='Logging in', errnote='Unable to log in',
                data=json.dumps(auth_data).encode('utf-8'))
        except ExtractorError as e:
            # Only HTTP 406 is handled here; its body is parsed as JSON so
            # that any error message in it can be reported below.
            if not (isinstance(e.cause, compat_HTTPError) and e.cause.code == 406):
                raise
            # The response body is bytes, so it has to be *decoded* to text;
            # calling .encode() on bytes raises AttributeError.
            auth_info = self._parse_json(e.cause.read().decode('utf-8'), None)

        # `error` may be missing or null; treat both as "no error".
        error_message = (auth_info.get('error') or {}).get('message')
        if error_message:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error_message),
                expected=True)

        self._token = auth_info['secure_streaming_token']

    def _real_initialize(self):
        """Set up the API session, then log in."""
        self._initialize_api()
        self._login()
99
100
class HRTiIE(HRTiBaseIE):
    """Extractor for a single HRTi video.

    Matches either an ``hrti.hrt.hr`` ``video/show/<id>/<display_id>`` URL
    (with or without the ``#/`` fragment prefix) or the short ``hrti:<id>``
    form.
    """
    _VALID_URL = r'''(?x)
                        (?:
                            hrti:(?P<short_id>[0-9]+)|
                            https?://
                                hrti\.hrt\.hr/(?:\#/)?video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?
                        )
                    '''
    _TESTS = [{
        'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd',
        'info_dict': {
            'id': '2181385',
            'display_id': 'republika-dokumentarna-serija-16-hd',
            'ext': 'mp4',
            'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)',
            'description': 'md5:48af85f620e8e0e1df4096270568544f',
            'duration': 2922,
            'view_count': int,
            'average_rating': int,
            'episode_number': int,
            'season_number': int,
            'age_limit': 12,
        },
        'skip': 'Requires account credentials',
    }, {
        'url': 'https://hrti.hrt.hr/#/video/show/2181385/',
        'only_matching': True,
    }, {
        'url': 'hrti:2181385',
        'only_matching': True,
    }, {
        'url': 'https://hrti.hrt.hr/video/show/3873068/cuvar-dvorca-dramska-serija-14',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the metadata for one video and build its info dict.

        The video id comes from whichever of the ``short_id`` / ``id`` groups
        matched; the display id falls back to the video id when the URL has
        no slug.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('short_id') or mobj.group('id')
        display_id = mobj.group('display_id') or video_id

        video = self._download_json(
            '%s/video_id/%s/format/json' % (self._search_url, video_id),
            display_id, 'Downloading video metadata JSON')['video'][0]

        title_info = video['title']
        title = title_info['title_long']

        movie = video['video_assets']['movie'][0]
        # The asset URL is a template with a TOKEN placeholder that takes the
        # streaming token stored at login.
        m3u8_url = movie['url'].format(TOKEN=self._token)
        formats = self._extract_m3u8_formats(
            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        self._sort_formats(formats)

        # `parental_control` may be missing *or* null; `.get(key, {})` only
        # covers the missing case, so fall back explicitly.
        parental_control = video.get('parental_control') or {}

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': clean_html(title_info.get('summary_long')),
            'duration': int_or_none(movie.get('duration')),
            'view_count': int_or_none(video.get('views')),
            'average_rating': int_or_none(video.get('user_rating')),
            'age_limit': parse_age_limit(parental_control.get('rating')),
            'formats': formats,
        }
172
173
class HRTiPlaylistIE(HRTiBaseIE):
    """Extractor for an HRTi category listing, returned as a playlist.

    Each listed video is handed off as an ``hrti:<id>`` URL result.
    """
    _VALID_URL = r'https?://hrti\.hrt\.hr/(?:#/)?video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?'
    _TESTS = [{
        'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena',
        'info_dict': {
            'id': '212',
            'title': 'ekumena',
        },
        'playlist_mincount': 8,
        'skip': 'Requires account credentials',
    }, {
        'url': 'https://hrti.hrt.hr/#/video/list/category/212/',
        'only_matching': True,
    }, {
        'url': 'https://hrti.hrt.hr/video/list/category/212/ekumena',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download a category listing and return its videos as a playlist.

        The category id is used as the playlist id; the display id (or the
        category id when the URL has no slug) is passed as the third argument
        of ``playlist_result``.
        """
        match = self._match_valid_url(url)
        category_id = match.group('id')
        display_id = match.group('display_id') or category_id

        listing_url = '%s/category_id/%s/format/json' % (self._search_url, category_id)
        response = self._download_json(
            listing_url, display_id, 'Downloading video metadata JSON')

        # Preferred source: the first alternative of the first video listing.
        video_ids = try_get(
            response, lambda x: x['video_listings'][0]['alternatives'][0]['list'],
            list)
        if not video_ids:
            # Fallback: collect the ids of the entries under `videos`.
            video_ids = [
                video['id']
                for video in response.get('videos', [])
                if video.get('id')
            ]

        entries = []
        for video_id in video_ids:
            entries.append(self.url_result('hrti:%s' % video_id))

        return self.playlist_result(entries, category_id, display_id)