]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/tvp.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / tvp.py
1 import itertools
2 import random
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 determine_ext,
8 dict_get,
9 ExtractorError,
10 int_or_none,
11 js_to_json,
12 orderedSet,
13 str_or_none,
14 try_get,
15 )
16
17
class TVPIE(InfoExtractor):
    """Extractor for Telewizja Polska article/video/program pages.

    Pages are either Vue client-side rendered (data exposed through
    ``window.__*Data`` assignments) or classic server-side rendered pages
    that embed a TVPlayer; the actual media is always delegated to
    ``TVPEmbed`` (or ``TVPWebsite`` for VOD series).
    """
    IE_NAME = 'tvp'
    IE_DESC = 'Telewizja Polska'
    _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'

    _TESTS = [{
        # TVPlayer 2 in js wrapper
        'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 – Władek',
            'description': 'md5:437f48b93558370b031740546b696e24',
            'age_limit': 12,
        },
    }, {
        # TVPlayer legacy
        'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
        'info_dict': {
            'id': '17916176',
            'ext': 'mp4',
            'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
            'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
        },
    }, {
        # TVPlayer 2 in iframe
        'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
        'info_dict': {
            'id': '50725617',
            'ext': 'mp4',
            'title': 'Dzieci na sprzedaż dla homoseksualistów',
            'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
            'age_limit': 12,
        },
    }, {
        # TVPlayer 2 in client-side rendered website (regional; window.__newsData)
        'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
        'info_dict': {
            'id': '25804446',
            'ext': 'mp4',
            'title': 'Studio Yayo',
            'upload_date': '20160616',
            'timestamp': 1466075700,
        }
    }, {
        # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
        'url': 'https://www.tvp.info/52880236/09042021-0800',
        'info_dict': {
            'id': '52880236',
            'ext': 'mp4',
            'title': '09.04.2021, 08:00',
        },
    }, {
        # client-side rendered (regional) program (playlist) page
        'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
        'info_dict': {
            'id': '9660819',
            'description': 'Od poniedziałku do piątku o 18:55',
            'title': 'Rozmowa dnia',
        },
        'playlist_mincount': 1800,
        'params': {
            'skip_download': True,
        }
    }, {
        # ABC-specific video embedding
        # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
        'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
        'info_dict': {
            'id': '48320456',
            'ext': 'mp4',
            'title': 'Teleranek, Żubr',
        },
        'skip': 'unavailable',
    }, {
        # yet another vue page
        'url': 'https://jp2.tvp.pl/46925618/filmy',
        'info_dict': {
            'id': '46925618',
            'title': 'Filmy',
        },
        'playlist_mincount': 19,
    }, {
        'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
        'only_matching': True,
    }, {
        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
        'only_matching': True,
    }, {
        'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
        'only_matching': True,
    }, {
        'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
        'only_matching': True,
    }, {
        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
        'only_matching': True,
    }, {
        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
        'only_matching': True,
    }, {
        'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
        'only_matching': True,
    }, {
        'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
        'only_matching': True,
    }, {
        'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
        'only_matching': True,
    }]

    def _parse_vue_website_data(self, webpage, page_id):
        """Parse the ``window.__websiteData``/``window.__directoryData`` object.

        Returns the decoded object, or None when the page carries no such
        assignment (callers decide whether that is fatal).
        """
        website_data = self._search_regex([
            # website - regiony, tvp.info
            # directory - jp2.tvp.pl
            r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});',
            # default=None keeps the search non-fatal so the None return below
            # is actually reachable
        ], webpage, 'website data', default=None)
        if not website_data:
            return None
        return self._parse_json(website_data, page_id, transform_source=js_to_json)

    def _extract_vue_video(self, video_data, page_id=None):
        """Build a ``url_transparent`` result from a Vue video/news object.

        ``video_data`` may be the raw JS object source (str) or an already
        parsed dict. ``page_id`` is used as the id when the object has no
        ``_id``. Raises ExtractorError when no id can be determined for a
        non-website entry.
        """
        if isinstance(video_data, str):
            video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)

        thumbnails = []
        image = video_data.get('image')
        if image:
            # 'image' is either a single object or a list of them
            for thumb in (image if isinstance(image, list) else [image]):
                thmb_url = str_or_none(thumb.get('url'))
                if thmb_url:
                    thumbnails.append({
                        'url': thmb_url,
                    })

        video_id = str_or_none(video_data.get('_id') or page_id)
        is_website = video_data.get('type') == 'website'
        if is_website:
            url = video_data['url']
            # rewrite https://vod.tvp.pl/<id>/<slug> into the
            # /website/<slug>,<id> form
            legacy_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
            if legacy_url_parts:
                url = f'https://vod.tvp.pl/website/{legacy_url_parts.group(2)},{legacy_url_parts.group(1)}'
        else:
            if not video_id:
                # previously this crashed with TypeError on 'tvp:' + None
                raise ExtractorError('Unable to determine video id from vue video data')
            url = 'tvp:' + video_id

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': url,
            'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite',
            'title': str_or_none(video_data.get('title')),
            'description': str_or_none(video_data.get('lead')),
            'timestamp': int_or_none(video_data.get('release_date_long')),
            'duration': int_or_none(video_data.get('duration')),
            'thumbnails': thumbnails,
        }

    def _handle_vuejs_page(self, url, webpage, page_id):
        """Extract a single video or a (paged) playlist from a Vue page.

        Raises ExtractorError when neither video nor website data is found.
        """
        # vue client-side rendered sites (all regional pages + tvp.info)
        video_data = self._search_regex([
            r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
        ], webpage, 'video data', default=None)
        if video_data:
            return self._extract_vue_video(video_data, page_id=page_id)

        # paged playlists
        website_data = self._parse_vue_website_data(webpage, page_id)
        if website_data:
            entries = self._vuejs_entries(url, website_data, page_id)

            return {
                '_type': 'playlist',
                'id': page_id,
                'title': str_or_none(website_data.get('title')),
                'description': str_or_none(website_data.get('lead')),
                'entries': entries,
            }
        raise ExtractorError('Could not extract video/website data')

    def _vuejs_entries(self, url, website_data, page_id):
        """Yield playlist entries from the first page and any following pages.

        Further pages are requested with ``?page=N`` (N >= 2) until one comes
        back without ``videos``/``items``.
        """

        def extract_videos(wd):
            if wd.get('latestVideo'):
                yield self._extract_vue_video(wd['latestVideo'])
            for video in wd.get('videos') or []:
                yield self._extract_vue_video(video)
            for video in wd.get('items') or []:
                yield self._extract_vue_video(video)

        yield from extract_videos(website_data)

        # Either counter may be absent; treat missing as 0 instead of letting
        # a None comparison raise TypeError
        total_count = int_or_none(website_data.get('items_total_count')) or 0
        per_page = int_or_none(website_data.get('items_per_page')) or 0
        if total_count <= per_page:
            return

        for page in itertools.count(2):
            page_website_data = self._parse_vue_website_data(
                self._download_webpage(url, page_id, note='Downloading page #%d' % page,
                                       query={'page': page}),
                page_id)
            # a page without data (None) ends pagination just like an empty one
            if not page_website_data or not (
                    page_website_data.get('videos') or page_website_data.get('items')):
                break
            yield from extract_videos(page_website_data)

    def _real_extract(self, url):
        page_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, page_id)

        # The URL may redirect to a VOD
        # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
        if TVPWebsiteIE.suitable(urlh.url):
            return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id)

        if re.search(
                r'window\.__(?:video|news|website|directory)Data\s*=',
                webpage):
            return self._handle_vuejs_page(url, webpage, page_id)

        # classic server-side rendered sites
        video_id = self._search_regex([
            r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
            r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
            r"object_id\s*:\s*'(\d+)'",
            r'data-video-id="(\d+)"',

            # abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video?
            # the first one is referenced to as "copyid", and seems to be unused by the website
            r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
        ], webpage, 'video id', default=page_id)
        return {
            '_type': 'url_transparent',
            'url': 'tvp:' + video_id,
            # fall back to the <meta name="description"> only on pages that
            # reference the '//s.tvp.pl/files/portal/v' assets
            'description': self._og_search_description(
                webpage, default=None) or (self._html_search_meta(
                    'description', webpage, default=None)
                    if '//s.tvp.pl/files/portal/v' in webpage else None),
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'ie_key': 'TVPEmbed',
        }
249
250
class TVPStreamIE(InfoExtractor):
    """Extractor for live channels on tvpstream.vod.tvp.pl.

    Resolves a channel id (or the site's default channel when none is given)
    to the currently playing video id and delegates playback to ``TVPEmbed``.
    """
    IE_NAME = 'tvp:stream'
    _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
    _TESTS = [{
        # untestable as "video" id changes many times across a day
        'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
        'only_matching': True,
    }, {
        'url': 'tvpstream:39821455',
        'only_matching': True,
    }, {
        # the default stream when you provide no channel_id, most probably TVP Info
        'url': 'tvpstream:',
        'only_matching': True,
    }, {
        'url': 'https://tvpstream.vod.tvp.pl/',
        'only_matching': True,
    }]

    # %s is filled with 'channel' or 'video' to pick the data-*-id attribute
    _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)'
    # groups: 1/2 = data-title (double/single quoted),
    #         3/4 = data-stationname (double/single quoted)
    _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')'

    def _real_extract(self, url):
        # may be an empty string: the id group in _VALID_URL is \d*
        channel_id = self._match_id(url)
        # parenthesised so the 'default' fallback applies to the id; without
        # the parentheses `%` binds first and the fallback is never used
        channel_url = self._proto_relative_url(
            '//tvpstream.vod.tvp.pl/?channel_id=%s' % (channel_id or 'default'))
        webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage')
        if not channel_id:
            channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel',
                                            webpage, 'default channel id')
        video_id = self._search_regex(self._PLAYER_BOX_RE % 'video',
                                      webpage, 'video id')
        # fetch all four groups: each attribute is captured by one of two
        # alternatives depending on the quote style used in the markup
        title_dq, title_sq, station_dq, station_sq = self._search_regex(
            self._BUTTON_RE % (re.escape(channel_id)), webpage,
            'audition title and station name',
            group=(1, 2, 3, 4))
        audition_title = title_dq if title_dq is not None else title_sq
        station_name = station_dq if station_dq is not None else station_sq
        return {
            '_type': 'url_transparent',
            'id': channel_id,
            'url': 'tvp:%s' % video_id,
            'title': audition_title,
            'alt_title': station_name,
            'is_live': True,
            'ie_key': 'TVPEmbed',
        }
295
296
class TVPEmbedIE(InfoExtractor):
    """Extractor for TVP player embeds and the internal ``tvp:<id>`` scheme.

    Fetches the player configuration from the TVPlayer2 ``api.php`` endpoint
    and turns it into formats, thumbnails and subtitles.
    """
    IE_NAME = 'tvp:embed'
    IE_DESC = 'Telewizja Polska'
    _VALID_URL = r'''(?x)
        (?:
            tvp:
            |https?://
                (?:[^/]+\.)?
                (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
                (?:sess/
                        (?:tvplayer\.php\?.*?object_id
                        |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
                    |shared/details\.php\?.*?object_id)
                =)
        (?P<id>\d+)
    '''
    # _VALID_URL[4:] drops the leading '(?x)' so the flag can be re-applied
    # at the start of the combined pattern
    _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL[4:]})']

    _TESTS = [{
        'url': 'tvp:194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 – Władek',
            'description': 'md5:76649d2014f65c99477be17f23a4dead',
            'age_limit': 12,
        },
    }, {
        'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
        'info_dict': {
            'id': '51247504',
            'ext': 'mp4',
            'title': 'Razmova 091220',
        },
    }, {
        # TVPlayer2 embed URL
        'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
        'only_matching': True,
    }, {
        'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
        'only_matching': True,
    }, {
        # pulsembed on dziennik.pl
        'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # it could be anything that is a valid JS function name
        callback = random.choice((
            'jebac_pis',
            'jebacpis',
            'ziobro',
            'sasin70',
            'sasin_przejebal_70_milionow_PLN',
            'tvp_is_a_state_propaganda_service',
        ))

        webpage = self._download_webpage(
            ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
             + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)

        # stripping JSONP padding
        # NOTE(review): the fixed offsets (15 + callback length, 3 trailing
        # chars) presumably mirror the wrapper the API emits - confirm
        # against a live response before changing them
        datastr = webpage[15 + len(callback):-3]
        if datastr.startswith('null,'):
            # a leading 'null,' is followed by a list of error objects
            error = self._parse_json(datastr[5:], video_id)
            raise ExtractorError(error[0]['desc'])

        content = self._parse_json(datastr, video_id)['content']
        info = content['info']
        is_live = try_get(info, lambda x: x['isLive'], bool)

        formats = []
        for file in content['files']:
            video_url = file.get('url')
            if not video_url:
                continue
            if video_url.endswith('.m3u8'):
                formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
            elif video_url.endswith('.mpd'):
                if is_live:
                    # doesn't work with either ffmpeg or native downloader
                    continue
                formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
            elif video_url.endswith('.f4m'):
                formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
            elif video_url.endswith('.ism/manifest'):
                formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
            else:
                # mp4, wmv or something
                # `or {}` also covers an explicit null 'quality'
                quality = file.get('quality') or {}
                formats.append({
                    'format_id': 'direct',
                    'url': video_url,
                    # 'type' is only a fallback hint, so it must not be mandatory
                    'ext': determine_ext(video_url, file.get('type')),
                    'fps': int_or_none(quality.get('fps')),
                    'tbr': int_or_none(quality.get('bitrate')),
                    'width': int_or_none(quality.get('width')),
                    'height': int_or_none(quality.get('height')),
                })

        self._sort_formats(formats)

        title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
        description = dict_get(info, ('description', 'seoDescription'))
        thumbnails = []
        for thumb in content.get('posters') or ():
            thumb_url = thumb.get('src')
            # skip templated URLs that still contain size placeholders
            if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
                continue
            thumbnails.append({
                'url': thumb_url,
                'width': thumb.get('width'),
                'height': thumb.get('height'),
            })
        age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
        if age_limit == 1:
            age_limit = 0
        duration = try_get(info, lambda x: x['duration'], int) if not is_live else None

        subtitles = {}
        for sub in content.get('subtitles') or []:
            if not sub.get('url'):
                continue
            subtitles.setdefault(sub['lang'], []).append({
                'url': sub['url'],
                'ext': sub.get('type'),
            })

        info_dict = {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'age_limit': age_limit,
            'is_live': is_live,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }

        # vod.tvp.pl
        if info.get('vortalName') == 'vod':
            # join only the parts that exist, so a missing subtitle no longer
            # produces a literal "<title>, None"
            title_parts = [str(part) for part in (info.get('title'), info.get('subtitle')) if part]
            info_dict.update({
                'title': ', '.join(title_parts) or title,
                'series': info.get('title'),
                'season': info.get('season'),
                'episode_number': info.get('episode'),
            })

        return info_dict
450
451
class TVPWebsiteIE(InfoExtractor):
    """Extractor for vod.tvp.pl ``/website/<slug>,<id>`` series/film pages.

    Walks the paginated ``/video`` listing and yields every linked video as
    a ``TVPEmbed`` entry.
    """
    IE_NAME = 'tvp:series'
    _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'

    _TESTS = [{
        # series
        'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
        'info_dict': {
            'id': '17069012',
        },
        'playlist_count': 312,
    }, {
        # film
        'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
        'info_dict': {
            'id': '51374509',
            'ext': 'mp4',
            'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie',
            'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
            'age_limit': 12,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': ['TVPEmbed'],
    }, {
        'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
        'only_matching': True,
    }]

    def _entries(self, display_id, playlist_id):
        """Yield ``TVPEmbed`` url results for every video of the listing.

        Pages are requested with ``?page=N`` starting at 1; iteration stops
        at the first page that links no videos.
        """
        url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
        # display_id comes straight from the URL, so escape it before it is
        # embedded in a pattern; the pattern is also loop-invariant
        video_link_re = r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % re.escape(display_id)
        for page_num in itertools.count(1):
            page = self._download_webpage(
                url, display_id, 'Downloading page %d' % page_num,
                query={'page': page_num})

            # the same video may be linked several times on one page
            video_ids = orderedSet(re.findall(video_link_re, page))

            if not video_ids:
                break

            for video_id in video_ids:
                yield self.url_result(
                    'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
                    video_id=video_id)

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        display_id, playlist_id = mobj.group('display_id', 'id')
        return self.playlist_result(
            self._entries(display_id, playlist_id), playlist_id)