]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/ceskatelevize.py
[cleanup] Upgrade syntax
[yt-dlp.git] / yt_dlp / extractor / ceskatelevize.py
CommitLineData
8cfb5bbf 1import re
8cfb5bbf 2
48246541 3from .common import InfoExtractor
1cc79574 4from ..compat import (
3e72f5f1 5 compat_urllib_parse_unquote,
39201787 6 compat_urllib_parse_urlparse,
1cc79574
PH
7)
8from ..utils import (
39201787 9 ExtractorError,
02ec32a1 10 float_or_none,
5c2266df 11 sanitized_Request,
443b21dc 12 traverse_obj,
6e6bc8da 13 urlencode_postdata,
5cb2d36c 14 USER_AGENTS,
8cfb5bbf 15)
16
17
class CeskaTelevizeIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    _TESTS = [{
        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
        'info_dict': {
            'id': '61924494877028507',
            'ext': 'mp4',
            'title': 'Hyde Park Civilizace: Bonus 01 - En',
            'description': 'English Subtittles',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 81.3,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # live stream
        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
        'info_dict': {
            'id': 402,
            'ext': 'mp4',
            'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
            'is_live': True,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'Georestricted to Czech Republic',
    }, {
        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
        'only_matching': True,
    }, {
        # video with 18+ caution trailer
        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
        'info_dict': {
            'id': '215562210900007-bogotart',
            'title': 'Queer: Bogotart',
            'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko',
        },
        'playlist': [{
            'info_dict': {
                'id': '61924494877311053',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Varování 18+)',
                'duration': 11.9,
            },
        }, {
            'info_dict': {
                'id': '61924494877068022',
                'ext': 'mp4',
                'title': 'Queer: Bogotart (Queer)',
                'thumbnail': r're:^https?://.*\.jpg',
                'duration': 1558.3,
            },
        }],
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # iframe embed
        'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
        'only_matching': True,
    }]

    def _extract_item_formats(self, item, playlist_id):
        """Collect the formats of every stream URL listed in one playlist item."""
        formats = []
        # 'streamUrls' may be missing or null; both mean "no streams" here
        for format_id, stream_url in (item.get('streamUrls') or {}).items():
            stream_url = stream_url.replace('https://', 'http://')
            if 'playerType=flash' in stream_url:
                stream_formats = self._extract_m3u8_formats(
                    stream_url, playlist_id, 'mp4', 'm3u8_native',
                    m3u8_id=f'hls-{format_id}', fatal=False)
            else:
                stream_formats = self._extract_mpd_formats(
                    stream_url, playlist_id,
                    mpd_id=f'dash-{format_id}', fatal=False)
            if 'drmOnly=true' in stream_url:
                for f in stream_formats:
                    f['has_drm'] = True
            # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
            if format_id == 'audioDescription':
                for f in stream_formats:
                    f['source_preference'] = -10
            formats.extend(stream_formats)
        return formats

    def _real_extract(self, url):
        """Build a playlist result for the given URL.

        The client playlist is requested twice: once with the default
        User-Agent and once with the Safari one.  Formats found by the second
        request are merged into the entries created by the first one when
        both playlists have the same number of items.

        Raises ExtractorError when the IDEC id cannot be found on a /porady/
        page or when the content is reported as unavailable in the territory.
        """
        playlist_id = self._match_id(url)
        parsed_url = compat_urllib_parse_urlparse(url)
        webpage = self._download_webpage(url, playlist_id)

        site_name = self._og_search_property('site_name', webpage, fatal=False, default=None)
        playlist_title = self._og_search_title(webpage, default=None)
        if site_name and playlist_title:
            playlist_title = playlist_title.replace(f' — {site_name}', '', 1)
        playlist_description = self._og_search_description(webpage, default=None)
        if playlist_description:
            playlist_description = playlist_description.replace('\xa0', ' ')

        if parsed_url.path.startswith('/porady/'):
            # /porady/ pages only carry the IDEC id; the player markup that is
            # parsed below comes from the iframe player page fetched with it
            next_data = self._search_nextjs_data(webpage, playlist_id)
            idec = traverse_obj(
                next_data,
                ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'),
                get_all=False)
            if not idec:
                raise ExtractorError('Failed to find IDEC id')
            iframe_hash = self._download_webpage(
                'https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id)
            webpage = self._download_webpage(
                'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id,
                query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec})

        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
        if f'{NOT_AVAILABLE_STRING}</p>' in webpage:
            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

        type_ = None
        episode_id = None

        playlist = self._parse_json(
            self._search_regex(
                r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
                default='{}'), playlist_id)
        if playlist:
            type_ = playlist.get('type')
            episode_id = playlist.get('id')

        # Fall back to dedicated regexes when the JSON object did not yield
        # a value; these searches are fatal if they do not match
        if not type_:
            type_ = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
                webpage, 'type')
        if not episode_id:
            episode_id = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
                webpage, 'episode_id')

        data = {
            'playlist[0][type]': type_,
            'playlist[0][id]': episode_id,
            'requestUrl': parsed_url.path,
            'requestSource': 'iVysilani',
        }

        entries = []

        for user_agent in (None, USER_AGENTS['Safari']):
            req = sanitized_Request(
                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
                data=urlencode_postdata(data))

            req.add_header('Content-type', 'application/x-www-form-urlencoded')
            req.add_header('x-addr', '127.0.0.1')
            req.add_header('X-Requested-With', 'XMLHttpRequest')
            if user_agent:
                req.add_header('User-Agent', user_agent)
            req.add_header('Referer', url)

            playlistpage = self._download_json(req, playlist_id, fatal=False)

            if not playlistpage:
                continue

            playlist_url = playlistpage['url']
            if playlist_url == 'error_region':
                raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

            req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
            req.add_header('Referer', url)

            playlist = self._download_json(req, playlist_id, fatal=False)
            if not playlist:
                continue

            playlist = playlist.get('playlist')
            if not isinstance(playlist, list):
                continue

            playlist_len = len(playlist)

            for num, item in enumerate(playlist):
                is_live = item.get('type') == 'LIVE'
                formats = self._extract_item_formats(item, playlist_id)

                # Second (Safari) pass: only add the extra formats to the
                # entries that the first pass already created
                if user_agent and len(entries) == playlist_len:
                    entries[num]['formats'].extend(formats)
                    continue

                item_id = item.get('id') or item['assetId']
                title = item['title']

                duration = float_or_none(item.get('duration'))
                thumbnail = item.get('previewImageUrl')

                subtitles = {}
                if item.get('type') == 'VOD':
                    subs = item.get('subtitles')
                    if subs:
                        subtitles = self.extract_subtitles(episode_id, subs)

                # playlist_title is None when the page has no og:title; use
                # the item title alone instead of rendering "None (...)"
                if playlist_len == 1 or not playlist_title:
                    final_title = playlist_title or title
                else:
                    final_title = f'{playlist_title} ({title})'

                entries.append({
                    'id': item_id,
                    'title': final_title,
                    'description': playlist_description if playlist_len == 1 else None,
                    'thumbnail': thumbnail,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                    'is_live': is_live,
                })

        for e in entries:
            self._sort_formats(e['formats'])

        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    def _get_subtitles(self, episode_id, subs):
        """Download the first subtitle track in subs and return it as 'cs' SRT data."""
        original_subtitles = self._download_webpage(
            subs[0]['url'], episode_id, 'Downloading subtitles')
        srt_subs = self._fix_subtitles(original_subtitles)
        return {
            'cs': [{
                'ext': 'srt',
                'data': srt_subs,
            }]
        }

    @staticmethod
    def _fix_subtitles(subtitles):
        """ Convert millisecond-based subtitles to SRT """

        def _msectotimecode(msec):
            """ Helper utility to convert milliseconds to timecode """
            components = []
            # Peel off milliseconds, seconds, minutes and hours in turn;
            # the hour component is taken modulo 100
            for divider in [1000, 60, 60, 100]:
                components.append(msec % divider)
                msec //= divider
            return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)

        def _fix_subtitle(subtitle):
            for line in subtitle.splitlines():
                # Cue header lines look like "<index>; <start> <stop>";
                # start and stop are converted with _msectotimecode
                m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
                if m:
                    yield m.group(1)
                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
                    yield f'{start} --> {stop}'
                else:
                    # Every other line is passed through unchanged
                    yield line

        return '\r\n'.join(_fix_subtitle(subtitles))