2 from __future__
import unicode_literals
8 from .common
import InfoExtractor
21 class TVPIE(InfoExtractor
):
23 IE_DESC
= 'Telewizja Polska'
24 _VALID_URL
= r
'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
27 # TVPlayer 2 in js wrapper
28 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
32 'title': 'Czas honoru, odc. 13 – Władek',
33 'description': 'md5:437f48b93558370b031740546b696e24',
38 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
42 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
43 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
46 # TVPlayer 2 in iframe
47 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
51 'title': 'Dzieci na sprzedaż dla homoseksualistów',
52 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
56 # TVPlayer 2 in client-side rendered website (regional; window.__newsData)
57 'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
61 'title': 'Studio Yayo',
62 'upload_date': '20160616',
63 'timestamp': 1466075700,
66 # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
67 'url': 'https://www.tvp.info/52880236/09042021-0800',
71 'title': '09.04.2021, 08:00',
74 # client-side rendered (regional) program (playlist) page
75 'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
78 'description': 'Od poniedziałku do piątku o 18:55',
79 'title': 'Rozmowa dnia',
81 'playlist_mincount': 1800,
83 'skip_download': True,
86 # ABC-specific video embedding
87 # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
88 'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
92 'title': 'Teleranek, Żubr',
94 'skip': 'unavailable',
96 # yet another vue page
97 'url': 'https://jp2.tvp.pl/46925618/filmy',
102 'playlist_mincount': 19,
104 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
105 'only_matching': True,
107 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
108 'only_matching': True,
110 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
111 'only_matching': True,
113 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
114 'only_matching': True,
116 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
117 'only_matching': True,
119 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
120 'only_matching': True,
122 'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
123 'only_matching': True,
125 'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
126 'only_matching': True,
128 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
129 'only_matching': True,
def _parse_vue_website_data(self, webpage, page_id):
    """Find the Vue website/directory data object in *webpage* and parse it.

    The JavaScript object literal assigned to ``window.__websiteData`` or
    ``window.__directoryData`` is captured, converted with ``js_to_json``
    and returned as parsed JSON.  *page_id* is passed to the parser.
    """
    data_patterns = [
        # website - regiony, tvp.info
        # directory - jp2.tvp.pl
        r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});',
    ]
    raw_data = self._search_regex(data_patterns, webpage, 'website data')
    return self._parse_json(raw_data, page_id, transform_source=js_to_json)
def _extract_vue_video(self, video_data, page_id=None):
    """Build a ``url_transparent`` result from one Vue video/website entry.

    video_data: a dict, or a JavaScript object literal given as ``str``
        (parsed here with ``js_to_json``).
    page_id: fallback id used when the entry has no ``_id``.

    Entries whose ``type`` is ``'website'`` are delegated to TVPWebsite using
    their own ``url``; everything else is delegated to TVPEmbed through a
    ``tvp:<id>`` URL.

    Raises ExtractorError when a non-website entry has neither ``_id`` nor
    *page_id* (previously an opaque ``TypeError`` from ``'tvp:' + None``).
    """
    if isinstance(video_data, str):
        video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)

    # 'image' may hold a single object or a list of them
    image = video_data.get('image')
    thumbnails = []
    if image:
        for thumb in (image if isinstance(image, list) else [image]):
            if not isinstance(thumb, dict):
                # skip malformed entries instead of failing on .get()
                continue
            thmb_url = str_or_none(thumb.get('url'))
            if thmb_url:
                thumbnails.append({'url': thmb_url})

    entry_id = str_or_none(video_data.get('_id') or page_id)
    is_website = video_data.get('type') == 'website'

    if is_website:
        url = video_data['url']
        # rewrite https://vod.tvp.pl/<id>/<slug> into the
        # https://vod.tvp.pl/website/<slug>,<id> form
        legacy_url = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
        if legacy_url:
            url = f'https://vod.tvp.pl/website/{legacy_url.group(2)},{legacy_url.group(1)}'
    else:
        if entry_id is None:
            raise ExtractorError('Could not determine the video id')
        url = 'tvp:' + entry_id

    return {
        '_type': 'url_transparent',
        'id': entry_id,
        'url': url,
        'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite',
        'title': str_or_none(video_data.get('title')),
        'description': str_or_none(video_data.get('lead')),
        'timestamp': int_or_none(video_data.get('release_date_long')),
        'duration': int_or_none(video_data.get('duration')),
        'thumbnails': thumbnails,
    }
def _handle_vuejs_page(self, url, webpage, page_id):
    """Extract a Vue client-side rendered page as a single video or a playlist.

    If the page embeds ``window.__newsData`` / ``window.__videoData`` it is
    treated as one video.  Otherwise the website/directory data is parsed
    and its entries are returned as a playlist titled and described by the
    data's ``title`` and ``lead`` fields.

    Raises ExtractorError when no website data could be obtained.
    """
    # vue client-side rendered sites (all regional pages + tvp.info)
    video_data = self._search_regex([
        r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
    ], webpage, 'video data', default=None)
    if video_data:
        return self._extract_vue_video(video_data, page_id=page_id)

    website_data = self._parse_vue_website_data(webpage, page_id)
    if not website_data:
        raise ExtractorError('Could not extract video/website data')

    # same helper TVPWebsiteIE uses to build its playlist result
    return self.playlist_result(
        self._vuejs_entries(url, website_data, page_id),
        page_id,
        str_or_none(website_data.get('title')),
        str_or_none(website_data.get('lead')))
def _vuejs_entries(self, url, website_data, page_id):
    """Yield video results for a Vue listing, following ``?page=N`` pagination.

    Entries from *website_data* (``latestVideo``, ``videos``, ``items``) are
    yielded first.  Further pages are requested only when the data reports
    more items than fit on one page, and paging stops at the first page
    that has neither ``videos`` nor ``items``.
    """

    def extract_videos(wd):
        latest = wd.get('latestVideo')
        if latest:
            yield self._extract_vue_video(latest)
        for key in ('videos', 'items'):
            for video in wd.get(key) or []:
                yield self._extract_vue_video(video)

    yield from extract_videos(website_data)

    # Either counter may be absent; comparing None with '>' would raise
    # TypeError, so a missing counter means "no further pages".
    total_count = int_or_none(website_data.get('items_total_count'))
    per_page = int_or_none(website_data.get('items_per_page'))
    if total_count is None or per_page is None or total_count <= per_page:
        return

    for page in itertools.count(2):
        page_website_data = self._parse_vue_website_data(
            self._download_webpage(
                url, page_id, note='Downloading page #%d' % page,
                query={'page': page}),
            page_id)
        if (not page_website_data
                or (not page_website_data.get('videos')
                    and not page_website_data.get('items'))):
            break
        yield from extract_videos(page_website_data)
def _real_extract(self, url):
    """Extract a TVP page.

    Three cases are handled in order: a redirect to a URL that TVPWebsiteIE
    accepts, a Vue client-side rendered page, and a classic server-side
    rendered page whose embedded player id is delegated to TVPEmbed.
    """
    page_id = self._match_id(url)
    webpage, urlh = self._download_webpage_handle(url, page_id)

    # The URL may redirect to a VOD
    # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
    final_url = urlh.url
    if TVPWebsiteIE.suitable(final_url):
        return self.url_result(final_url, ie=TVPWebsiteIE.ie_key(), video_id=page_id)

    has_vue_data = re.search(
        r'window\.__(?:video|news|website|directory)Data\s*=',
        webpage)
    if has_vue_data:
        return self._handle_vuejs_page(url, webpage, page_id)

    # classic server-side rendered sites
    embed_id_patterns = [
        r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
        r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
        r"object_id\s*:\s*'(\d+)'",
        r'data-video-id="(\d+)"',

        # abc.tvp.pl - there can be more than one video id that seems to be the same video;
        # the first one is referred to as "copyid" and seems to be unused by the website
        r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
    ]
    # fall back to the page id when no embedded player id is found
    video_id = self._search_regex(
        embed_id_patterns, webpage, 'video id', default=page_id)

    # prefer the OpenGraph description; the plain meta description is only
    # consulted on pages that reference the portal assets
    description = self._og_search_description(webpage, default=None)
    if not description:
        if '//s.tvp.pl/files/portal/v' in webpage:
            description = self._html_search_meta(
                'description', webpage, default=None)
        else:
            description = None

    return {
        '_type': 'url_transparent',
        'url': 'tvp:' + video_id,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'ie_key': 'TVPEmbed',
    }
254 class TVPStreamIE(InfoExtractor
):
255 IE_NAME
= 'tvp:stream'
256 _VALID_URL
= r
'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
258 # untestable as "video" id changes many times across a day
259 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
260 'only_matching': True,
262 'url': 'tvpstream:39821455',
263 'only_matching': True,
265 # the default stream when you provide no channel_id, most probably TVP Info
267 'only_matching': True,
269 'url': 'https://tvpstream.vod.tvp.pl/',
270 'only_matching': True,
273 _PLAYER_BOX_RE
= r
'<div\s[^>]*id\s*=\s*["\']?tvp_player_box
["\']?[^>]+data-%s-id\s*=\s*["\']?
(\d
+)'
274 _BUTTON_RE = r'<div\s
[^
>]*data
-channel
-id=["\']?%s["\']?
[^
>]*\sdata
-title
=(?
:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^
"]*)"|
\'([^
\']*)\')'
# Resolve a tvpstream channel page into a url_transparent result handled by TVPEmbed.
276 def _real_extract(self, url):
277 channel_id = self._match_id(url)
# NOTE(review): '%' binds tighter than 'or', so the 'default' fallback below is
# applied to the already formatted (always non-empty) string and is never used;
# an empty channel_id therefore produces '...?channel_id='.
278 channel_url = self._proto_relative_url('//tvpstream
.vod
.tvp
.pl
/?channel_id
=%s' % channel_id or 'default
')
279 webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage
')
# channel and video ids are read from the player box markup (_PLAYER_BOX_RE)
281 channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel
',
282 webpage, 'default channel
id')
283 video_id = self._search_regex(self._PLAYER_BOX_RE % 'video
',
# title and station name come from the element whose data-channel-id matches (_BUTTON_RE)
285 audition_title, station_name = self._search_regex(
286 self._BUTTON_RE % (re.escape(channel_id)), webpage,
287 'audition title
and station name
',
# the actual media extraction is delegated to TVPEmbed through the tvp:<video_id> URL
290 '_type
': 'url_transparent
',
292 'url
': 'tvp
:%s' % video_id,
293 'title
': audition_title,
294 'alt_title
': station_name,
296 'ie_key
': 'TVPEmbed
',
300 class TVPEmbedIE(InfoExtractor):
301 IE_NAME = 'tvp
:embed
'
302 IE_DESC = 'Telewizja Polska
'
303 _VALID_URL = r'''(?x)
308 (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
310 (?:tvplayer\.php\?.*?object_id
311 |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
312 |shared/details\.php\?.*?object_id)
322 'title
': 'Czas honoru
, odc
. 13 – Władek
',
323 'description
': 'md5
:76649d2014f65c99477be17f23a4dead
',
327 'url
': 'https
://www
.tvp
.pl
/sess
/tvplayer
.php?object_id
=51247504&
;autoplay
=false
',
331 'title
': 'Razmova
091220',
334 # TVPlayer2 embed URL
335 'url
': 'https
://tvp
.info
/sess
/TVPlayer2
/embed
.php?ID
=50595757',
336 'only_matching
': True,
338 'url
': 'https
://wiadomosci
.tvp
.pl
/sess
/TVPlayer2
/api
.php?
id=51233452',
339 'only_matching
': True,
341 # pulsembed on dziennik.pl
342 'url
': 'https
://www
.tvp
.pl
/shared
/details
.php?copy_id
=52205981&object_id
=52204505&autoplay
=false
&is_muted
=false
&allowfullscreen
=true
&template
=external
-embed
/video
/iframe
-video
.html
',
343 'only_matching
': True,
def _extract_urls(webpage, **kw):
    """Return the TVP embed URLs found in ``<iframe src=...>`` tags of *webpage*."""
    # The first four characters of _VALID_URL (its '(?x)' flag prefix) are
    # dropped because the flag is already set at the start of this pattern.
    iframe_re = r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:]
    return [found.group('embed') for found in re.finditer(iframe_re, webpage)]
# Fetch the TVPlayer2 config for the matched id through a JSONP API call and
# build formats, thumbnails, subtitles and metadata from its 'content' object.
352 def _real_extract(self, url):
353 video_id = self._match_id(url)
355 # it could be anything that is a valid JS function name
356 callback = random.choice((
361 'sasin_przejebal_70_milionow_PLN',
362 'tvp_is_a_state_propaganda_service',
365 webpage = self._download_webpage(
366 ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
367 + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)
369 # stripping JSONP padding
# drops the first 15 + len(callback) characters and the last 3
# NOTE(review): the fixed offsets presumably match the API's wrapper text — TODO confirm
370 datastr = webpage[15 + len(callback):-3]
# a payload starting with 'null,' is an error response: the remainder is parsed
# and the 'desc' of its first item is raised
371 if datastr.startswith('null,'):
372 error = self._parse_json(datastr[5:], video_id)
373 raise ExtractorError(error[0]['desc'])
375 content = self._parse_json(datastr, video_id)['content']
376 info = content['info']
377 is_live = try_get(info, lambda x: x['isLive'], bool)
# each file is dispatched on its URL suffix to the matching manifest extractor
380 for file in content['files']:
381 video_url = file.get('url')
384 if video_url.endswith('.m3u8'):
385 formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
386 elif video_url.endswith('.mpd'):
388 # doesn't work with either ffmpeg or native downloader
390 formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
391 elif video_url.endswith('.f4m'):
392 formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
393 elif video_url.endswith('.ism/manifest'):
394 formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
396 # mp4, wmv or something
397 quality = file.get('quality', {})
399 'format_id': 'direct',
401 'ext': determine_ext(video_url, file['type']),
402 'fps': int_or_none(quality.get('fps')),
403 'tbr': int_or_none(quality.get('bitrate')),
404 'width': int_or_none(quality.get('width')),
405 'height': int_or_none(quality.get('height')),
408 self._sort_formats(formats)
410 title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
411 description = dict_get(info, ('description', 'seoDescription'))
413 for thumb in content.get('posters') or ():
414 thumb_url = thumb.get('src')
# posters without a URL, or whose URL still contains the '{width}'/'{height}'
# placeholders, are detected here
415 if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
418 'url': thumb.get('src'),
419 'width': thumb.get('width'),
420 'height': thumb.get('height'),
422 age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
# no duration is reported for live streams
425 duration = try_get(info, lambda x: x['duration'], int) if not is_live else None
428 for sub in content.get('subtitles') or []:
429 if not sub.get('url'):
# subtitles are grouped per language code
431 subtitles.setdefault(sub['lang'], []).append({
433 'ext': sub.get('type'),
439 'description': description,
440 'thumbnails': thumbnails,
441 'age_limit': age_limit,
443 'duration': duration,
445 'subtitles': subtitles,
# entries from the 'vod' vortal additionally get series-style metadata
449 if info.get('vortalName') == 'vod':
# NOTE(review): a missing 'title' or 'subtitle' is formatted as the literal text 'None'
451 'title': '%s, %s' % (info.get('title'), info.get('subtitle')),
452 'series': info.get('title'),
453 'season': info.get('season'),
454 'episode_number': info.get('episode'),
460 class TVPWebsiteIE(InfoExtractor):
461 IE_NAME = 'tvp:series'
462 _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'
466 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
470 'playlist_count': 312,
473 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
477 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie',
478 'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
482 'skip_download': True,
484 'add_ie': ['TVPEmbed'],
486 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
487 'only_matching': True,
def _entries(self, display_id, playlist_id):
    """Yield a TVPEmbed url result for every video linked from the series listing.

    The paginated ``/video`` listing of the series is downloaded page by
    page (starting at 1) until a page contains no matching video links.
    """
    url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
    # display_id is taken from the URL and may contain regex metacharacters,
    # so it is escaped before being embedded in the pattern; the pattern is
    # built once instead of on every page.
    video_link_re = r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % re.escape(display_id)

    for page_num in itertools.count(1):
        page = self._download_webpage(
            url, display_id, 'Downloading page %d' % page_num,
            query={'page': page_num})

        video_ids = orderedSet(re.findall(video_link_re, page))
        if not video_ids:
            break

        for video_id in video_ids:
            yield self.url_result(
                'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
                video_id=video_id)
def _real_extract(self, url):
    """Return the series at *url* as a playlist identified by its numeric id."""
    match = self._match_valid_url(url)
    display_id = match.group('display_id')
    playlist_id = match.group('id')
    entries = self._entries(display_id, playlist_id)
    return self.playlist_result(entries, playlist_id)