]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/tvp.py
69168f6558bde624db53d63439307da3f3c4b0f8
[yt-dlp.git] / yt_dlp / extractor / tvp.py
1 import itertools
2 import random
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 determine_ext,
8 dict_get,
9 ExtractorError,
10 int_or_none,
11 js_to_json,
12 orderedSet,
13 str_or_none,
14 try_get,
15 )
16
17
class TVPIE(InfoExtractor):
    """Extractor for article/video pages on TVP-operated websites.

    Handles two page flavours:
    * Vue.js client-side rendered pages that expose their data through
      ``window.__(video|news|website|directory)Data`` assignments, and
    * classic server-side rendered pages, from which only the embedded
      player's object id is scraped.

    Actual media extraction is delegated to ``TVPEmbed`` / ``TVPWebsite``
    through ``url_transparent`` / playlist results.
    """
    IE_NAME = 'tvp'
    IE_DESC = 'Telewizja Polska'
    _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'

    _TESTS = [{
        # TVPlayer 2 in js wrapper
        'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 – Władek',
            'description': 'md5:437f48b93558370b031740546b696e24',
            'age_limit': 12,
        },
    }, {
        # TVPlayer legacy
        'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
        'info_dict': {
            'id': '17916176',
            'ext': 'mp4',
            'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
            'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
        },
    }, {
        # TVPlayer 2 in iframe
        'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
        'info_dict': {
            'id': '50725617',
            'ext': 'mp4',
            'title': 'Dzieci na sprzedaż dla homoseksualistów',
            'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
            'age_limit': 12,
        },
    }, {
        # TVPlayer 2 in client-side rendered website (regional; window.__newsData)
        'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
        'info_dict': {
            'id': '25804446',
            'ext': 'mp4',
            'title': 'Studio Yayo',
            'upload_date': '20160616',
            'timestamp': 1466075700,
        }
    }, {
        # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
        'url': 'https://www.tvp.info/52880236/09042021-0800',
        'info_dict': {
            'id': '52880236',
            'ext': 'mp4',
            'title': '09.04.2021, 08:00',
        },
    }, {
        # client-side rendered (regional) program (playlist) page
        'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
        'info_dict': {
            'id': '9660819',
            'description': 'Od poniedziałku do piątku o 18:55',
            'title': 'Rozmowa dnia',
        },
        'playlist_mincount': 1800,
        'params': {
            'skip_download': True,
        }
    }, {
        # ABC-specific video embedding
        # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
        'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
        'info_dict': {
            'id': '48320456',
            'ext': 'mp4',
            'title': 'Teleranek, Żubr',
        },
        'skip': 'unavailable',
    }, {
        # yet another vue page
        'url': 'https://jp2.tvp.pl/46925618/filmy',
        'info_dict': {
            'id': '46925618',
            'title': 'Filmy',
        },
        'playlist_mincount': 19,
    }, {
        'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
        'only_matching': True,
    }, {
        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
        'only_matching': True,
    }, {
        'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
        'only_matching': True,
    }, {
        'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
        'only_matching': True,
    }, {
        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
        'only_matching': True,
    }, {
        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
        'only_matching': True,
    }, {
        'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
        'only_matching': True,
    }, {
        'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
        'only_matching': True,
    }, {
        'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
        'only_matching': True,
    }]

    def _parse_vue_website_data(self, webpage, page_id):
        """Return the parsed ``window.__websiteData``/``__directoryData`` object.

        Returns None when the page carries no such assignment, so callers can
        decide how to react (fail, or stop paging).
        """
        website_data = self._search_regex([
            # website - regiony, tvp.info
            # directory - jp2.tvp.pl
            r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});',
        ], webpage, 'website data', default=None)  # non-fatal: absence is handled by callers
        if not website_data:
            return None
        return self._parse_json(website_data, page_id, transform_source=js_to_json)

    def _extract_vue_video(self, video_data, page_id=None):
        """Build a ``url_transparent`` result from one Vue video/news object.

        ``video_data`` may be the raw JS object source (str) or an already
        parsed dict. ``page_id`` is used as the id when the object has no
        ``_id`` of its own.

        Raises ExtractorError when a non-website entry has no usable id.
        """
        if isinstance(video_data, str):
            video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)

        # 'image' is either a single object or a list of them
        thumbnails = []
        image = video_data.get('image')
        if image:
            for thumb in (image if isinstance(image, list) else [image]):
                thmb_url = str_or_none(thumb.get('url'))
                if thmb_url:
                    thumbnails.append({
                        'url': thmb_url,
                    })

        video_id = str_or_none(video_data.get('_id') or page_id)
        is_website = video_data.get('type') == 'website'
        if is_website:
            url = video_data['url']
            # rewrite https://vod.tvp.pl/<id>/<slug> into the
            # /website/<slug>,<id> form that TVPWebsiteIE matches
            legacy_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
            if legacy_url_parts:
                url = f'https://vod.tvp.pl/website/{legacy_url_parts.group(2)},{legacy_url_parts.group(1)}'
        else:
            if not video_id:
                # previously this surfaced as a TypeError from 'tvp:' + None
                raise ExtractorError('Could not determine video id')
            url = 'tvp:' + video_id
        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': url,
            'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite',
            'title': str_or_none(video_data.get('title')),
            'description': str_or_none(video_data.get('lead')),
            'timestamp': int_or_none(video_data.get('release_date_long')),
            'duration': int_or_none(video_data.get('duration')),
            'thumbnails': thumbnails,
        }

    def _handle_vuejs_page(self, url, webpage, page_id):
        """Extract a single video or a paged playlist from a Vue.js page.

        Raises ExtractorError when neither video data nor website data is found.
        """
        # vue client-side rendered sites (all regional pages + tvp.info)
        video_data = self._search_regex([
            r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
        ], webpage, 'video data', default=None)
        if video_data:
            return self._extract_vue_video(video_data, page_id=page_id)
        # paged playlists
        website_data = self._parse_vue_website_data(webpage, page_id)
        if website_data:
            entries = self._vuejs_entries(url, website_data, page_id)

            return {
                '_type': 'playlist',
                'id': page_id,
                'title': str_or_none(website_data.get('title')),
                'description': str_or_none(website_data.get('lead')),
                'entries': entries,
            }
        raise ExtractorError('Could not extract video/website data')

    def _vuejs_entries(self, url, website_data, page_id):
        """Yield playlist entries from ``website_data`` and any following pages."""

        def extract_videos(wd):
            if wd.get('latestVideo'):
                yield self._extract_vue_video(wd['latestVideo'])
            for video in wd.get('videos') or []:
                yield self._extract_vue_video(video)
            for video in wd.get('items') or []:
                yield self._extract_vue_video(video)

        yield from extract_videos(website_data)

        # Either counter may be missing from the page data; comparing None
        # values would raise a TypeError, so treat a missing counter as 0.
        total_count = int_or_none(website_data.get('items_total_count')) or 0
        per_page = int_or_none(website_data.get('items_per_page')) or 0
        if total_count <= per_page:
            return

        for page in itertools.count(2):
            page_website_data = self._parse_vue_website_data(
                self._download_webpage(url, page_id, note='Downloading page #%d' % page,
                                       query={'page': page}),
                page_id)
            # stop on a page without data or without any videos/items
            if not page_website_data or not (
                    page_website_data.get('videos') or page_website_data.get('items')):
                break
            yield from extract_videos(page_website_data)

    def _real_extract(self, url):
        page_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, page_id)

        # The URL may redirect to a VOD
        # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
        if TVPWebsiteIE.suitable(urlh.url):
            return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id)

        if re.search(
                r'window\.__(?:video|news|website|directory)Data\s*=',
                webpage):
            return self._handle_vuejs_page(url, webpage, page_id)

        # classic server-side rendered sites
        video_id = self._search_regex([
            r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
            r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
            r"object_id\s*:\s*'(\d+)'",
            r'data-video-id="(\d+)"',

            # abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video?
            # the first one is referenced to as "copyid", and seems to be unused by the website
            r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
        ], webpage, 'video id', default=page_id)

        # Prefer the OpenGraph description; the plain <meta> description is
        # only consulted on pages that reference the '//s.tvp.pl/files/portal/v' assets.
        description = self._og_search_description(webpage, default=None)
        if not description and '//s.tvp.pl/files/portal/v' in webpage:
            description = self._html_search_meta('description', webpage, default=None)

        return {
            '_type': 'url_transparent',
            'url': 'tvp:' + video_id,
            'description': description or None,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'ie_key': 'TVPEmbed',
        }
249
250
class TVPStreamIE(InfoExtractor):
    """Extractor for live channels on tvpstream.vod.tvp.pl.

    Resolves a channel id (or the site's default channel when none is given)
    to the currently playing video id and delegates to ``TVPEmbed``.
    """
    IE_NAME = 'tvp:stream'
    _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
    _TESTS = [{
        # untestable as "video" id changes many times across a day
        'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
        'only_matching': True,
    }, {
        'url': 'tvpstream:39821455',
        'only_matching': True,
    }, {
        # the default stream when you provide no channel_id, most probably TVP Info
        'url': 'tvpstream:',
        'only_matching': True,
    }, {
        'url': 'https://tvpstream.vod.tvp.pl/',
        'only_matching': True,
    }]

    # %s is filled with 'channel' or 'video' to read the matching data-*-id attribute
    _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)'
    # Groups: 1/2 = data-title (double-/single-quoted), 3/4 = data-stationname (double-/single-quoted)
    _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')'

    def _real_extract(self, url):
        channel_id = self._match_id(url)
        # Parenthesised on purpose: '%' binds tighter than 'or', so without the
        # parentheses the 'default' fallback could never be used.
        channel_url = self._proto_relative_url(
            '//tvpstream.vod.tvp.pl/?channel_id=%s' % (channel_id or 'default'))
        webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage')
        if not channel_id:
            channel_id = self._search_regex(
                self._PLAYER_BOX_RE % 'channel', webpage, 'default channel id')
        video_id = self._search_regex(
            self._PLAYER_BOX_RE % 'video', webpage, 'video id')

        # Each attribute is captured by one of two alternative groups depending
        # on the quote style used; the group that did not participate is None.
        title_dq, title_sq, station_dq, station_sq = self._search_regex(
            self._BUTTON_RE % (re.escape(channel_id)), webpage,
            'audition title and station name',
            group=(1, 2, 3, 4))
        audition_title = title_dq if title_dq is not None else title_sq
        station_name = station_dq if station_dq is not None else station_sq

        return {
            '_type': 'url_transparent',
            'id': channel_id,
            'url': 'tvp:%s' % video_id,
            'title': audition_title,
            'alt_title': station_name,
            'is_live': True,
            'ie_key': 'TVPEmbed',
        }
295
296
class TVPEmbedIE(InfoExtractor):
    """Extractor for TVP player embeds and the internal ``tvp:<id>`` scheme.

    Queries the TVPlayer2 ``api.php`` endpoint (``getTvpConfig``) for the given
    object id and builds formats, thumbnails and subtitles from its response.
    """
    IE_NAME = 'tvp:embed'
    IE_DESC = 'Telewizja Polska'
    _VALID_URL = r'''(?x)
        (?:
            tvp:
            |https?://
                (?:[^/]+\.)?
                (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
                (?:sess/
                        (?:tvplayer\.php\?.*?object_id
                        |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
                    |shared/details\.php\?.*?object_id)
                =)
        (?P<id>\d+)
    '''

    _TESTS = [{
        'url': 'tvp:194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 – Władek',
            'description': 'md5:76649d2014f65c99477be17f23a4dead',
            'age_limit': 12,
        },
    }, {
        'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
        'info_dict': {
            'id': '51247504',
            'ext': 'mp4',
            'title': 'Razmova 091220',
        },
    }, {
        # TVPlayer2 embed URL
        'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
        'only_matching': True,
    }, {
        'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
        'only_matching': True,
    }, {
        # pulsembed on dziennik.pl
        'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage, **kw):
        """Return the ``src`` of every <iframe> in ``webpage`` that matches ``_VALID_URL``."""
        # _VALID_URL[4:] drops the leading '(?x)' flag, which is re-applied at
        # the start of the combined pattern.
        return [m.group('embed') for m in re.finditer(
            r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:],
            webpage)]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # it could be anything that is a valid JS function name
        callback = random.choice((
            'jebac_pis',
            'jebacpis',
            'ziobro',
            'sasin70',
            'sasin_przejebal_70_milionow_PLN',
            'tvp_is_a_state_propaganda_service',
        ))

        webpage = self._download_webpage(
            ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
             + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)

        # stripping JSONP padding
        # NOTE(review): relies on a fixed-size wrapper around the payload
        # (15 characters plus the callback name in front, 3 behind) - confirm
        # against live responses before changing.
        datastr = webpage[15 + len(callback):-3]
        if datastr.startswith('null,'):
            # a leading 'null,' is followed by a JSON list of error objects
            error = self._parse_json(datastr[5:], video_id)
            raise ExtractorError(error[0]['desc'])

        content = self._parse_json(datastr, video_id)['content']
        info = content['info']
        is_live = try_get(info, lambda x: x['isLive'], bool)

        formats = []
        for file in content['files']:
            video_url = file.get('url')
            if not video_url:
                continue
            if video_url.endswith('.m3u8'):
                formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
            elif video_url.endswith('.mpd'):
                if is_live:
                    # doesn't work with either ffmpeg or native downloader
                    continue
                formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
            elif video_url.endswith('.f4m'):
                formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
            elif video_url.endswith('.ism/manifest'):
                formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
            else:
                # mp4, wmv or something
                # 'quality' may be absent or null; fall back to an empty dict either way
                quality = file.get('quality') or {}
                formats.append({
                    'format_id': 'direct',
                    'url': video_url,
                    # 'type' is only a fallback extension, so its absence must not be fatal
                    'ext': determine_ext(video_url, file.get('type')),
                    'fps': int_or_none(quality.get('fps')),
                    'tbr': int_or_none(quality.get('bitrate')),
                    'width': int_or_none(quality.get('width')),
                    'height': int_or_none(quality.get('height')),
                })

        self._sort_formats(formats)

        title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
        description = dict_get(info, ('description', 'seoDescription'))
        thumbnails = []
        for thumb in content.get('posters') or ():
            thumb_url = thumb.get('src')
            # skip templated URLs that still contain size placeholders
            if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
                continue
            thumbnails.append({
                'url': thumb_url,
                'width': thumb.get('width'),
                'height': thumb.get('height'),
            })
        age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
        if age_limit == 1:
            age_limit = 0
        duration = try_get(info, lambda x: x['duration'], int) if not is_live else None

        subtitles = {}
        for sub in content.get('subtitles') or []:
            if not sub.get('url'):
                continue
            subtitles.setdefault(sub['lang'], []).append({
                'url': sub['url'],
                'ext': sub.get('type'),
            })

        info_dict = {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'age_limit': age_limit,
            'is_live': is_live,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }

        # vod.tvp.pl
        if info.get('vortalName') == 'vod':
            # "<series title>, <episode subtitle>"; only the parts that are
            # actually present are joined, so a missing one no longer shows
            # up as the literal text "None".
            title_parts = [
                str(part) for part in (info.get('title'), info.get('subtitle')) if part]
            info_dict.update({
                'title': ', '.join(title_parts) or title,
                'series': info.get('title'),
                'season': info.get('season'),
                'episode_number': info.get('episode'),
            })

        return info_dict
455
456
class TVPWebsiteIE(InfoExtractor):
    """Playlist extractor for ``vod.tvp.pl/website/<slug>,<id>`` pages.

    Walks the paged ``/video`` listing of the website and yields one
    ``TVPEmbed`` entry per linked video.
    """
    IE_NAME = 'tvp:series'
    _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'

    _TESTS = [{
        # series
        'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
        'info_dict': {
            'id': '17069012',
        },
        'playlist_count': 312,
    }, {
        # film
        'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
        'info_dict': {
            'id': '51374509',
            'ext': 'mp4',
            'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie',
            'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
            'age_limit': 12,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': ['TVPEmbed'],
    }, {
        'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
        'only_matching': True,
    }]

    def _entries(self, display_id, playlist_id):
        """Yield a ``TVPEmbed`` url result for every video linked from the listing pages.

        Pages are requested with ``page=1, 2, ...`` until one contains no
        matching video links.
        """
        url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
        # display_id comes straight from the URL, so escape it before embedding
        # it in a pattern; hoisted because it does not change between pages.
        video_link_re = (
            r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % re.escape(display_id))
        for page_num in itertools.count(1):
            page = self._download_webpage(
                url, display_id, 'Downloading page %d' % page_num,
                query={'page': page_num})

            # de-duplicate while keeping the order of first appearance
            video_ids = orderedSet(re.findall(video_link_re, page))

            if not video_ids:
                break

            for video_id in video_ids:
                yield self.url_result(
                    'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
                    video_id=video_id)

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        display_id, playlist_id = mobj.group('display_id', 'id')
        return self.playlist_result(
            self._entries(display_id, playlist_id), playlist_id)