5 from .common
import InfoExtractor
18 class TVPIE(InfoExtractor
):
20 IE_DESC
= 'Telewizja Polska'
21 _VALID_URL
= r
'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
24 # TVPlayer 2 in js wrapper
25 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
29 'title': 'Czas honoru, odc. 13 – Władek',
30 'description': 'md5:437f48b93558370b031740546b696e24',
35 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
39 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
40 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
43 # TVPlayer 2 in iframe
44 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
48 'title': 'Dzieci na sprzedaż dla homoseksualistów',
49 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
53 # TVPlayer 2 in client-side rendered website (regional; window.__newsData)
54 'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
58 'title': 'Studio Yayo',
59 'upload_date': '20160616',
60 'timestamp': 1466075700,
63 # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
64 'url': 'https://www.tvp.info/52880236/09042021-0800',
68 'title': '09.04.2021, 08:00',
71 # client-side rendered (regional) program (playlist) page
72 'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
75 'description': 'Od poniedziałku do piątku o 18:55',
76 'title': 'Rozmowa dnia',
78 'playlist_mincount': 1800,
80 'skip_download': True,
83 # ABC-specific video embedding
84 # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
85 'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
89 'title': 'Teleranek, Żubr',
91 'skip': 'unavailable',
93 # yet another vue page
94 'url': 'https://jp2.tvp.pl/46925618/filmy',
99 'playlist_mincount': 19,
101 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
102 'only_matching': True,
104 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
105 'only_matching': True,
107 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
108 'only_matching': True,
110 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
111 'only_matching': True,
113 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
114 'only_matching': True,
116 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
117 'only_matching': True,
119 'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
120 'only_matching': True,
122 'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
123 'only_matching': True,
125 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
126 'only_matching': True,
def _parse_vue_website_data(self, webpage, page_id):
    """Locate and parse the Vue website/directory data object of a page.

    Args:
        webpage: HTML source of the page.
        page_id: Identifier handed to ``_parse_json`` together with the data.

    Returns:
        Whatever ``_parse_json`` produces for the JS object literal assigned
        to ``window.__websiteData`` / ``window.__directoryData``, after it
        has been passed through ``js_to_json``.
    """
    website_data = self._search_regex([
        # website - regiony, tvp.info
        # directory - jp2.tvp.pl
        r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});',
    ], webpage, 'website data')
    return self._parse_json(website_data, page_id, transform_source=js_to_json)
def _extract_vue_video(self, video_data, page_id=None):
    """Build a ``url_transparent`` result from one Vue video/website object.

    Args:
        video_data: A dict, or a string holding a JS object literal (parsed
            here with ``js_to_json``).
        page_id: Fallback identifier used when ``video_data`` has no ``_id``.

    Returns:
        A dict that delegates to ``TVPWebsite`` when ``type == 'website'``
        and to ``TVPEmbed`` otherwise.

    Raises:
        ExtractorError: for a non-website object when neither ``_id`` nor
            ``page_id`` is available (previously an opaque ``TypeError``
            from concatenating ``'tvp:'`` with ``None``).
    """
    if isinstance(video_data, str):
        video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)

    # 'image' may be a single object or a list of objects; anything that is
    # not a dict (including a missing image) is skipped instead of crashing.
    image = video_data.get('image')
    thumbnails = []
    for thumb in (image if isinstance(image, list) else [image]):
        if not isinstance(thumb, dict):
            continue
        thmb_url = str_or_none(thumb.get('url'))
        if thmb_url:
            thumbnails.append({'url': thmb_url})

    video_id = str_or_none(video_data.get('_id') or page_id)
    is_website = video_data.get('type') == 'website'
    if is_website:
        url = video_data['url']
        # https://vod.tvp.pl/<id>/<slug>  ->  https://vod.tvp.pl/website/<slug>,<id>
        legacy_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
        if legacy_url_parts:
            url = f'https://vod.tvp.pl/website/{legacy_url_parts.group(2)},{legacy_url_parts.group(1)}'
    else:
        if video_id is None:
            raise ExtractorError('Video data has no "_id" and no page id was supplied')
        url = 'tvp:' + video_id

    # NOTE(review): the 'url' entry is not visible in the reviewed listing
    # (it skips lines here); it is assumed because `url` is computed above
    # and a url_transparent result needs a target - confirm.
    return {
        '_type': 'url_transparent',
        'id': video_id,
        'url': url,
        'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite',
        'title': str_or_none(video_data.get('title')),
        'description': str_or_none(video_data.get('lead')),
        'timestamp': int_or_none(video_data.get('release_date_long')),
        'duration': int_or_none(video_data.get('duration')),
        'thumbnails': thumbnails,
    }
def _handle_vuejs_page(self, url, webpage, page_id):
    """Extract a single video or a playlist from a Vue client-side rendered page.

    Args:
        url: Page URL, forwarded to ``_vuejs_entries`` for pagination.
        webpage: HTML source of the page.
        page_id: Identifier of the page.

    Returns:
        The result of ``_extract_vue_video`` when a news/video data object is
        present, otherwise a playlist built from the website data.

    Raises:
        ExtractorError: when no usable video or website data is found.
    """
    # vue client-side rendered sites (all regional pages + tvp.info)
    video_data = self._search_regex([
        r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
    ], webpage, 'video data', default=None)
    if video_data:
        return self._extract_vue_video(video_data, page_id=page_id)

    website_data = self._parse_vue_website_data(webpage, page_id)
    if website_data:
        entries = self._vuejs_entries(url, website_data, page_id)
        # NOTE(review): only the 'title' and 'description' entries are
        # visible in the reviewed listing; '_type', 'id' and 'entries' are
        # assumed from the surrounding code - confirm.
        return {
            '_type': 'playlist',
            'id': page_id,
            'title': str_or_none(website_data.get('title')),
            'description': str_or_none(website_data.get('lead')),
            'entries': entries,
        }

    raise ExtractorError('Could not extract video/website data')
192 def _vuejs_entries(self
, url
, website_data
, page_id
):
194 def extract_videos(wd
):
195 if wd
.get('latestVideo'):
196 yield self
._extract
_vue
_video
(wd
['latestVideo'])
197 for video
in wd
.get('videos') or []:
198 yield self
._extract
_vue
_video
(video
)
199 for video
in wd
.get('items') or []:
200 yield self
._extract
_vue
_video
(video
)
202 yield from extract_videos(website_data
)
204 if website_data
.get('items_total_count') > website_data
.get('items_per_page'):
205 for page
in itertools
.count(2):
206 page_website_data
= self
._parse
_vue
_website
_data
(
207 self
._download
_webpage
(url
, page_id
, note
='Downloading page #%d' % page
,
208 query
={'page': page}
),
210 if not page_website_data
.get('videos') and not page_website_data
.get('items'):
212 yield from extract_videos(page_website_data
)
def _real_extract(self, url):
    """Extract a TVP page: VOD redirect, Vue page, or classic embedded player.

    Args:
        url: URL matched by this extractor.

    Returns:
        A result pointing at ``TVPWebsiteIE`` when the final URL is suitable
        for it, the result of ``_handle_vuejs_page`` for Vue data pages, or a
        ``url_transparent`` dict delegating to ``TVPEmbed``.
    """
    page_id = self._match_id(url)
    webpage, urlh = self._download_webpage_handle(url, page_id)

    # The URL may redirect to a VOD
    # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
    final_url = urlh.url
    if TVPWebsiteIE.suitable(final_url):
        return self.url_result(final_url, ie=TVPWebsiteIE.ie_key(), video_id=page_id)

    if re.search(r'window\.__(?:video|news|website|directory)Data\s*=', webpage):
        return self._handle_vuejs_page(url, webpage, page_id)

    # classic server-side rendered sites
    video_id = self._search_regex([
        r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
        r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
        r"object_id\s*:\s*'(\d+)'",
        r'data-video-id="(\d+)"',

        # abc.tvp.pl - somehow there is more than one video ID that seems to be the same video?
        # the first one is referred to as "copyid", and seems to be unused by the website
        r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
    ], webpage, 'video id', default=page_id)

    # og:description first; the plain meta description is consulted only on
    # pages that reference the '//s.tvp.pl/files/portal/v' assets.
    description = self._og_search_description(webpage, default=None)
    if not description:
        description = (
            self._html_search_meta('description', webpage, default=None)
            if '//s.tvp.pl/files/portal/v' in webpage else None)

    return {
        '_type': 'url_transparent',
        'url': 'tvp:' + video_id,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'ie_key': 'TVPEmbed',
    }
251 class TVPStreamIE(InfoExtractor
):
252 IE_NAME
= 'tvp:stream'
253 _VALID_URL
= r
'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
255 # untestable as "video" id changes many times across a day
256 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
257 'only_matching': True,
259 'url': 'tvpstream:39821455',
260 'only_matching': True,
262 # the default stream when you provide no channel_id, most probably TVP Info
264 'only_matching': True,
266 'url': 'https://tvpstream.vod.tvp.pl/',
267 'only_matching': True,
270 _PLAYER_BOX_RE
= r
'<div\s[^>]*id\s*=\s*["\']?tvp_player_box
["\']?[^>]+data-%s-id\s*=\s*["\']?
(\d
+)'
271 _BUTTON_RE = r'<div\s
[^
>]*data
-channel
-id=["\']?%s["\']?
[^
>]*\sdata
-title
=(?
:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^
"]*)"|
\'([^
\']*)\')'
273 def _real_extract(self, url):
274 channel_id = self._match_id(url)
275 channel_url = self._proto_relative_url('//tvpstream
.vod
.tvp
.pl
/?channel_id
=%s' % channel_id or 'default
')
276 webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage
')
278 channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel
',
279 webpage, 'default channel
id')
280 video_id = self._search_regex(self._PLAYER_BOX_RE % 'video
',
282 audition_title, station_name = self._search_regex(
283 self._BUTTON_RE % (re.escape(channel_id)), webpage,
284 'audition title
and station name
',
287 '_type
': 'url_transparent
',
289 'url
': 'tvp
:%s' % video_id,
290 'title
': audition_title,
291 'alt_title
': station_name,
293 'ie_key
': 'TVPEmbed
',
297 class TVPEmbedIE(InfoExtractor):
298 IE_NAME = 'tvp
:embed
'
299 IE_DESC = 'Telewizja Polska
'
300 _VALID_URL = r'''(?x)
305 (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
307 (?:tvplayer\.php\?.*?object_id
308 |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
309 |shared/details\.php\?.*?object_id)
319 'title
': 'Czas honoru
, odc
. 13 – Władek
',
320 'description
': 'md5
:76649d2014f65c99477be17f23a4dead
',
324 'url
': 'https
://www
.tvp
.pl
/sess
/tvplayer
.php?object_id
=51247504&
;autoplay
=false
',
328 'title
': 'Razmova
091220',
331 # TVPlayer2 embed URL
332 'url
': 'https
://tvp
.info
/sess
/TVPlayer2
/embed
.php?ID
=50595757',
333 'only_matching
': True,
335 'url
': 'https
://wiadomosci
.tvp
.pl
/sess
/TVPlayer2
/api
.php?
id=51233452',
336 'only_matching
': True,
338 # pulsembed on dziennik.pl
339 'url
': 'https
://www
.tvp
.pl
/shared
/details
.php?copy_id
=52205981&object_id
=52204505&autoplay
=false
&is_muted
=false
&allowfullscreen
=true
&template
=external
-embed
/video
/iframe
-video
.html
',
340 'only_matching
': True,
def _extract_urls(webpage, **kw):
    """Return the ``src`` URL of every iframe in ``webpage`` that embeds a TVP player.

    Args:
        webpage: HTML source to scan.
        **kw: Accepted and ignored.

    Returns:
        A list with the ``embed`` group of each match, in document order.
    """
    # NOTE(review): the function takes no `self`; no decorator is visible in
    # the reviewed listing - confirm it is declared static in the class.
    # _VALID_URL[4:] drops the pattern's leading '(?x)' so the flag can be
    # stated once at the very start of the combined expression.
    embed_re = r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:]
    return [m.group('embed') for m in re.finditer(embed_re, webpage)]
# Resolves a TVP object id through the TVPlayer2 `getTvpConfig` JSONP endpoint
# and collects formats, thumbnails, subtitles and metadata from the `content`
# object of the response.
# NOTE(review): the leading integers are original line numbers and the gaps
# between them show that this listing omits statements (e.g. the
# initialisation of `formats`, `thumbnails` and `subtitles`, and some callback
# names); check the complete file before changing any logic here.
349 def _real_extract(self, url):
350 video_id = self._match_id(url)
352 # it could be anything that is a valid JS function name
353 callback = random.choice((
358 'sasin_przejebal_70_milionow_PLN',
359 'tvp_is_a_state_propaganda_service',
362 webpage = self._download_webpage(
363 ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
364 + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)
366 # stripping JSONP padding
# The slice assumes a fixed-size wrapper: 15 characters plus the callback name
# in front of the payload and 3 characters behind it.
367 datastr = webpage[15 + len(callback):-3]
# A payload starting with 'null,' is treated as an error report; the 'desc'
# of its first entry becomes the error message.
368 if datastr.startswith('null,'):
369 error = self._parse_json(datastr[5:], video_id)
370 raise ExtractorError(error[0]['desc'])
372 content = self._parse_json(datastr, video_id)['content']
373 info = content['info']
# presumably None unless info['isLive'] is a bool - confirm try_get's contract
374 is_live = try_get(info, lambda x: x['isLive'], bool)
# The manifest type of each file is chosen purely by its URL suffix.
377 for file in content['files']:
378 video_url = file.get('url')
381 if video_url.endswith('.m3u8'):
382 formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
383 elif video_url.endswith('.mpd'):
385 # doesn't work with either ffmpeg or native downloader
387 formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
388 elif video_url.endswith('.f4m'):
389 formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
390 elif video_url.endswith('.ism/manifest'):
391 formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
393 # mp4, wmv or something
394 quality = file.get('quality', {})
396 'format_id': 'direct',
# NOTE(review): file['type'] is a hard key lookup while the neighbouring
# fields use .get(); a file entry without 'type' raises KeyError.
398 'ext': determine_ext(video_url, file['type']),
399 'fps': int_or_none(quality.get('fps')),
400 'tbr': int_or_none(quality.get('bitrate')),
401 'width': int_or_none(quality.get('width')),
402 'height': int_or_none(quality.get('height')),
405 self._sort_formats(formats)
407 title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
408 description = dict_get(info, ('description', 'seoDescription'))
410 for thumb in content.get('posters') or ():
411 thumb_url = thumb.get('src')
# Posters without a URL, or whose URL still contains the '{width}' /
# '{height}' placeholders, are singled out by this condition.
412 if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
415 'url': thumb.get('src'),
416 'width': thumb.get('width'),
417 'height': thumb.get('height'),
419 age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
# duration is forced to None for live streams
422 duration = try_get(info, lambda x: x['duration'], int) if not is_live else None
425 for sub in content.get('subtitles') or []:
426 if not sub.get('url'):
# subtitles are grouped per language code; sub['lang'] is a hard key lookup
428 subtitles.setdefault(sub['lang'], []).append({
430 'ext': sub.get('type'),
436 'description': description,
437 'thumbnails': thumbnails,
438 'age_limit': age_limit,
440 'duration': duration,
442 'subtitles': subtitles,
# Entries of the 'vod' vortal get series metadata and a combined title.
446 if info.get('vortalName') == 'vod':
# NOTE(review): when 'subtitle' is absent this renders as '<title>, None'.
448 'title': '%s, %s' % (info.get('title'), info.get('subtitle')),
449 'series': info.get('title'),
450 'season': info.get('season'),
451 'episode_number': info.get('episode'),
457 class TVPWebsiteIE(InfoExtractor):
458 IE_NAME = 'tvp:series'
459 _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'
463 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
467 'playlist_count': 312,
470 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
474 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie',
475 'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
479 'skip_download': True,
481 'add_ie': ['TVPEmbed'],
483 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
484 'only_matching': True,
def _entries(self, display_id, playlist_id):
    """Yield one ``TVPEmbed`` result per video linked from a series' listing pages.

    Args:
        display_id: URL slug of the series.
        playlist_id: Numeric id of the series.

    Yields:
        ``url_result`` entries for ``tvp:<video id>``, page by page, until a
        page contains no matching video links.
    """
    url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
    # The slug comes from the URL and is only known not to contain a comma;
    # escape it so characters such as '.', '+' or '(' are matched literally
    # instead of being interpreted as regex syntax.
    video_link_re = r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % re.escape(display_id)
    for page_num in itertools.count(1):
        page = self._download_webpage(
            url, display_id, 'Downloading page %d' % page_num,
            query={'page': page_num})

        video_ids = orderedSet(re.findall(video_link_re, page))
        if not video_ids:
            break

        for video_id in video_ids:
            # NOTE(review): the tail of this call is not visible in the
            # reviewed listing; `video_id=` is assumed - confirm.
            yield self.url_result(
                'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
                video_id=video_id)
506 def _real_extract(self, url):
507 mobj = self._match_valid_url(url)
508 display_id, playlist_id = mobj.group('display_id
', 'id')
509 return self.playlist_result(
510 self._entries(display_id, playlist_id), playlist_id)