]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tvp.py
[panopto] Add extractors (#2908)
[yt-dlp.git] / yt_dlp / extractor / tvp.py
CommitLineData
6f8cb242 1# coding: utf-8
24144e3b 2from __future__ import unicode_literals
5137ebac 3
d9308378 4import itertools
c15de6ff 5import random
29f400b9
TF
6import re
7
5137ebac 8from .common import InfoExtractor
6e3c2047 9from ..utils import (
d9308378 10 determine_ext,
c15de6ff 11 dict_get,
6e3c2047 12 ExtractorError,
c15de6ff
LL
13 int_or_none,
14 js_to_json,
d9308378 15 orderedSet,
c15de6ff
LL
16 str_or_none,
17 try_get,
6e3c2047 18)
c3a3028f 19
5137ebac 20
6f8cb242
S
class TVPIE(InfoExtractor):
    """Extractor for Telewizja Polska article, video and programme pages.

    Pages rendered client-side (detected by a ``window.__*Data`` assignment)
    are handled by ``_handle_vuejs_page``; everything else is treated as a
    classic server-side rendered page and delegated to ``TVPEmbed``.
    """

    IE_NAME = 'tvp'
    IE_DESC = 'Telewizja Polska'
    _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'

    _TESTS = [{
        # TVPlayer 2 in js wrapper
        'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 – Władek',
            'description': 'md5:437f48b93558370b031740546b696e24',
            'age_limit': 12,
        },
    }, {
        # TVPlayer legacy
        'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
        'info_dict': {
            'id': '17916176',
            'ext': 'mp4',
            'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
            'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
        },
    }, {
        # TVPlayer 2 in iframe
        'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
        'info_dict': {
            'id': '50725617',
            'ext': 'mp4',
            'title': 'Dzieci na sprzedaż dla homoseksualistów',
            'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
            'age_limit': 12,
        },
    }, {
        # TVPlayer 2 in client-side rendered website (regional; window.__newsData)
        'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
        'info_dict': {
            'id': '25804446',
            'ext': 'mp4',
            'title': 'Studio Yayo',
            'upload_date': '20160616',
            'timestamp': 1466075700,
        }
    }, {
        # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
        'url': 'https://www.tvp.info/52880236/09042021-0800',
        'info_dict': {
            'id': '52880236',
            'ext': 'mp4',
            'title': '09.04.2021, 08:00',
        },
    }, {
        # client-side rendered (regional) program (playlist) page
        'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
        'info_dict': {
            'id': '9660819',
            'description': 'Od poniedziałku do piątku o 18:55',
            'title': 'Rozmowa dnia',
        },
        'playlist_mincount': 1800,
        'params': {
            'skip_download': True,
        }
    }, {
        # ABC-specific video embedding
        # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
        'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
        'info_dict': {
            'id': '48320456',
            'ext': 'mp4',
            'title': 'Teleranek, Żubr',
        },
        'skip': 'unavailable',
    }, {
        # yet another vue page
        'url': 'https://jp2.tvp.pl/46925618/filmy',
        'info_dict': {
            'id': '46925618',
            'title': 'Filmy',
        },
        'playlist_mincount': 19,
    }, {
        'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
        'only_matching': True,
    }, {
        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
        'only_matching': True,
    }, {
        'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
        'only_matching': True,
    }, {
        'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
        'only_matching': True,
    }, {
        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
        'only_matching': True,
    }, {
        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
        'only_matching': True,
    }, {
        'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
        'only_matching': True,
    }, {
        'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
        'only_matching': True,
    }, {
        'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
        'only_matching': True,
    }]

    def _parse_vue_website_data(self, webpage, page_id):
        """Parse the ``window.__websiteData`` / ``window.__directoryData`` object.

        Returns the decoded object, or None when the page contains no such
        assignment.
        """
        website_data = self._search_regex([
            # website - regional sites, tvp.info
            # directory - jp2.tvp.pl
            r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});',
        ], webpage, 'website data', default=None)
        # Without ``default=None`` the search raises on a miss, which made this
        # guard (and the callers' ``if website_data`` checks) unreachable.
        if not website_data:
            return None
        return self._parse_json(website_data, page_id, transform_source=js_to_json)

    def _extract_vue_video(self, video_data, page_id=None):
        """Build a ``url_transparent`` result from one video/website entry.

        ``video_data`` is either an already decoded dict or a JS object literal
        string. Entries whose ``type`` is ``'website'`` are delegated to
        ``TVPWebsite``; all others to ``TVPEmbed`` through a ``tvp:<id>`` URL.

        Raises ExtractorError when a non-website entry has neither an ``_id``
        nor a fallback ``page_id``.
        """
        if isinstance(video_data, str):
            video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)

        video_id = str_or_none(video_data.get('_id') or page_id)

        # ``image`` may be a single object or a list of them
        images = video_data.get('image') or []
        if not isinstance(images, list):
            images = [images]
        thumbnails = []
        for image in images:
            if not isinstance(image, dict):
                continue
            thumb_url = str_or_none(image.get('url'))
            if thumb_url:
                thumbnails.append({'url': thumb_url})

        is_website = video_data.get('type') == 'website'
        if is_website:
            url = video_data['url']
            # https://vod.tvp.pl/<id>/<slug> is rewritten to the
            # /website/<slug>,<id> form that TVPWebsiteIE matches
            legacy_url = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
            if legacy_url:
                url = f'https://vod.tvp.pl/website/{legacy_url.group(2)},{legacy_url.group(1)}'
        else:
            if not video_id:
                # previously this surfaced as a TypeError from 'tvp:' + None
                raise ExtractorError('Unable to determine video id')
            url = 'tvp:' + video_id

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': url,
            'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite',
            'title': str_or_none(video_data.get('title')),
            'description': str_or_none(video_data.get('lead')),
            'timestamp': int_or_none(video_data.get('release_date_long')),
            'duration': int_or_none(video_data.get('duration')),
            'thumbnails': thumbnails,
        }

    def _handle_vuejs_page(self, url, webpage, page_id):
        """Extract a single video or a paged playlist from a client-side rendered page.

        Raises ExtractorError when neither video data nor website data is found.
        """
        # vue client-side rendered sites (all regional pages + tvp.info)
        video_data = self._search_regex([
            r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
        ], webpage, 'video data', default=None)
        if video_data:
            return self._extract_vue_video(video_data, page_id=page_id)

        # paged playlists
        website_data = self._parse_vue_website_data(webpage, page_id)
        if website_data:
            entries = self._vuejs_entries(url, website_data, page_id)

            return {
                '_type': 'playlist',
                'id': page_id,
                'title': str_or_none(website_data.get('title')),
                'description': str_or_none(website_data.get('lead')),
                'entries': entries,
            }
        raise ExtractorError('Could not extract video/website data')

    def _vuejs_entries(self, url, website_data, page_id):
        """Yield playlist entries from ``website_data`` and, if needed, further pages.

        Additional pages are requested with ``?page=N`` starting at 2 until a
        page yields no website data or lists neither ``videos`` nor ``items``.
        """

        def extract_videos(wd):
            if wd.get('latestVideo'):
                yield self._extract_vue_video(wd['latestVideo'])
            for video in wd.get('videos') or []:
                yield self._extract_vue_video(video)
            for video in wd.get('items') or []:
                yield self._extract_vue_video(video)

        yield from extract_videos(website_data)

        # Either counter may be absent; comparing None with ``>`` raises TypeError
        total_count = int_or_none(website_data.get('items_total_count')) or 0
        per_page = int_or_none(website_data.get('items_per_page')) or 0
        if total_count <= per_page:
            return

        for page in itertools.count(2):
            page_website_data = self._parse_vue_website_data(
                self._download_webpage(url, page_id, note='Downloading page #%d' % page,
                                       query={'page': page}),
                page_id)
            if not page_website_data:
                break
            if not page_website_data.get('videos') and not page_website_data.get('items'):
                break
            yield from extract_videos(page_website_data)

    def _real_extract(self, url):
        page_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, page_id)

        # The URL may redirect to a VOD
        # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
        if TVPWebsiteIE.suitable(urlh.url):
            return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id)

        if re.search(
                r'window\.__(?:video|news|website|directory)Data\s*=',
                webpage):
            return self._handle_vuejs_page(url, webpage, page_id)

        # classic server-side rendered sites; falls back to the page id
        video_id = self._search_regex([
            r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
            r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
            r"object_id\s*:\s*'(\d+)'",
            r'data-video-id="(\d+)"',

            # abc.tvp.pl - somehow there is more than one video ID that seems to be the same video?
            # the first one is referred to as "copyid", and seems to be unused by the website
            r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
        ], webpage, 'video id', default=page_id)
        return {
            '_type': 'url_transparent',
            'url': 'tvp:' + video_id,
            # the <meta name="description"> fallback is only used on pages
            # that reference //s.tvp.pl/files/portal/v
            'description': self._og_search_description(
                webpage, default=None) or (self._html_search_meta(
                    'description', webpage, default=None)
                    if '//s.tvp.pl/files/portal/v' in webpage else None),
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'ie_key': 'TVPEmbed',
        }
252
253
ebfab36f
LL
class TVPStreamIE(InfoExtractor):
    """Extractor for live channels on tvpstream.vod.tvp.pl.

    Resolves the channel page to the currently playing video id and delegates
    the actual media extraction to ``TVPEmbed``.
    """

    IE_NAME = 'tvp:stream'
    _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
    _TESTS = [{
        # untestable as "video" id changes many times across a day
        'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
        'only_matching': True,
    }, {
        'url': 'tvpstream:39821455',
        'only_matching': True,
    }, {
        # the default stream when you provide no channel_id, most probably TVP Info
        'url': 'tvpstream:',
        'only_matching': True,
    }, {
        'url': 'https://tvpstream.vod.tvp.pl/',
        'only_matching': True,
    }]

    # ``%s`` is filled with 'channel' or 'video'; captures the numeric id
    _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)'
    # ``%s`` is filled with the (escaped) channel id. Capture groups:
    #   1 = double-quoted data-title,       2 = single-quoted data-title,
    #   3 = double-quoted data-stationname, 4 = single-quoted data-stationname
    _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')'

    def _real_extract(self, url):
        channel_id = self._match_id(url)
        # NOTE: this used to read ``'...%s' % channel_id or 'default'``. ``%`` binds
        # tighter than ``or``, so 'default' was never sent; the request that was
        # actually made (raw, possibly empty, channel id) is kept as-is.
        channel_url = self._proto_relative_url('//tvpstream.vod.tvp.pl/?channel_id=%s' % channel_id)
        webpage = self._download_webpage(
            channel_url, channel_id or 'default', 'Downloading channel webpage')
        if not channel_id:
            channel_id = self._search_regex(
                self._PLAYER_BOX_RE % 'channel', webpage, 'default channel id')
        video_id = self._search_regex(
            self._PLAYER_BOX_RE % 'video', webpage, 'video id')

        # Fetch all four alternatives: only one of each quoted pair takes part
        # in a match. Asking for groups (1, 2) returned both title variants
        # and never the station name.
        title_dq, title_sq, station_dq, station_sq = self._search_regex(
            self._BUTTON_RE % (re.escape(channel_id)), webpage,
            'audition title and station name',
            group=(1, 2, 3, 4))
        audition_title = title_dq if title_dq is not None else title_sq
        station_name = station_dq if station_dq is not None else station_sq

        return {
            '_type': 'url_transparent',
            'id': channel_id,
            'url': 'tvp:%s' % video_id,
            'title': audition_title,
            'alt_title': station_name,
            'is_live': True,
            'ie_key': 'TVPEmbed',
        }
298
299
fdd0b8f8
RA
class TVPEmbedIE(InfoExtractor):
    """Extractor for TVP player embeds and internal ``tvp:<id>`` URLs.

    Queries the TVPlayer2 ``getTvpConfig`` JSONP endpoint for the given object
    id and builds formats, thumbnails and subtitles from its ``content``.
    """

    IE_NAME = 'tvp:embed'
    IE_DESC = 'Telewizja Polska'
    _VALID_URL = r'''(?x)
        (?:
            tvp:
            |https?://
                (?:[^/]+\.)?
                (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
                (?:sess/
                        (?:tvplayer\.php\?.*?object_id
                        |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
                    |shared/details\.php\?.*?object_id)
                =)
        (?P<id>\d+)
    '''

    _TESTS = [{
        'url': 'tvp:194536',
        'info_dict': {
            'id': '194536',
            'ext': 'mp4',
            'title': 'Czas honoru, odc. 13 – Władek',
            'description': 'md5:76649d2014f65c99477be17f23a4dead',
            'age_limit': 12,
        },
    }, {
        'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
        'info_dict': {
            'id': '51247504',
            'ext': 'mp4',
            'title': 'Razmova 091220',
        },
    }, {
        # TVPlayer2 embed URL
        'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
        'only_matching': True,
    }, {
        'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
        'only_matching': True,
    }, {
        # pulsembed on dziennik.pl
        'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage, **kw):
        """Return the ``src`` of every ``<iframe>`` in ``webpage`` that matches ``_VALID_URL``."""
        # ``_VALID_URL[4:]`` drops the leading ``(?x)`` so that the pattern can
        # be embedded after the ``(?x)`` of the wrapping expression.
        return [m.group('embed') for m in re.finditer(
            r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:],
            webpage)]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # it could be anything that is a valid JS function name
        callback = random.choice((
            'jebac_pis',
            'jebacpis',
            'ziobro',
            'sasin70',
            'sasin_przejebal_70_milionow_PLN',
            'tvp_is_a_state_propaganda_service',
        ))

        webpage = self._download_webpage(
            ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
             + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)

        # stripping JSONP padding
        # NOTE(review): the fixed offsets (15 + callback length, trailing 3)
        # depend on the exact wrapper the endpoint emits - confirm if it changes
        datastr = webpage[15 + len(callback):-3]
        if datastr.startswith('null,'):
            error = self._parse_json(datastr[5:], video_id)
            raise ExtractorError(error[0]['desc'])

        content = self._parse_json(datastr, video_id)['content']
        info = content['info']
        is_live = try_get(info, lambda x: x['isLive'], bool)

        formats = []
        for file_info in content.get('files') or []:
            video_url = file_info.get('url')
            if not video_url:
                continue
            if video_url.endswith('.m3u8'):
                formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
            elif video_url.endswith('.mpd'):
                if is_live:
                    # doesn't work with either ffmpeg or native downloader
                    continue
                formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
            elif video_url.endswith('.f4m'):
                formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
            elif video_url.endswith('.ism/manifest'):
                formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
            else:
                # mp4, wmv or something
                # ``quality`` and ``type`` are optional; a missing or null
                # value must not abort the whole extraction
                quality = file_info.get('quality') or {}
                formats.append({
                    'format_id': 'direct',
                    'url': video_url,
                    'ext': determine_ext(video_url, file_info.get('type')),
                    'fps': int_or_none(quality.get('fps')),
                    'tbr': int_or_none(quality.get('bitrate')),
                    'width': int_or_none(quality.get('width')),
                    'height': int_or_none(quality.get('height')),
                })

        self._sort_formats(formats)

        title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
        description = dict_get(info, ('description', 'seoDescription'))

        thumbnails = []
        for thumb in content.get('posters') or ():
            thumb_url = thumb.get('src')
            # skip URL templates with unfilled size placeholders
            if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
                continue
            thumbnails.append({
                'url': thumb_url,
                'width': thumb.get('width'),
                'height': thumb.get('height'),
            })

        age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
        if age_limit == 1:
            age_limit = 0
        duration = try_get(info, lambda x: x['duration'], int) if not is_live else None

        subtitles = {}
        for sub in content.get('subtitles') or []:
            if not sub.get('url'):
                continue
            # a track without a language tag is kept under 'und' instead of
            # raising KeyError
            subtitles.setdefault(sub.get('lang') or 'und', []).append({
                'url': sub['url'],
                'ext': sub.get('type'),
            })

        info_dict = {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'age_limit': age_limit,
            'is_live': is_live,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }

        # vod.tvp.pl
        if info.get('vortalName') == 'vod':
            series = info.get('title')
            # join only the parts that exist, so a missing subtitle no longer
            # produces a title such as "Series, None"
            title_parts = filter(None, map(str_or_none, (series, info.get('subtitle'))))
            info_dict.update({
                'title': ', '.join(title_parts) or title,
                'series': series,
                'season': info.get('season'),
                'episode_number': info.get('episode'),
            })

        return info_dict
458
6ce2c678 459
class TVPWebsiteIE(InfoExtractor):
    """Playlist extractor for ``vod.tvp.pl/website/<slug>,<id>`` pages."""

    IE_NAME = 'tvp:series'
    _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'

    _TESTS = [{
        # series
        'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
        'info_dict': {
            'id': '17069012',
        },
        'playlist_count': 312,
    }, {
        # film
        'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
        'info_dict': {
            'id': '51374509',
            'ext': 'mp4',
            'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie',
            'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
            'age_limit': 12,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': ['TVPEmbed'],
    }, {
        'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
        'only_matching': True,
    }]

    def _entries(self, display_id, playlist_id):
        """Yield a ``tvp:<id>`` result for every video linked from the listing pages.

        Pages are requested with ``?page=N`` starting at 1; iteration stops at
        the first page that links no videos for ``display_id``.
        """
        url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
        # ``display_id`` comes straight from the URL, so it is escaped before
        # being embedded into the pattern
        video_link_re = r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % re.escape(display_id)

        for page_num in itertools.count(1):
            page = self._download_webpage(
                url, display_id, 'Downloading page %d' % page_num,
                query={'page': page_num})

            # de-duplicate while keeping page order
            video_ids = orderedSet(re.findall(video_link_re, page))

            if not video_ids:
                break

            for video_id in video_ids:
                yield self.url_result(
                    'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
                    video_id=video_id)

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        display_id, playlist_id = mobj.group('display_id', 'id')
        return self.playlist_result(
            self._entries(display_id, playlist_id), playlist_id)