[yt-dlp.git] / yt_dlp / extractor / wppilot.py

# coding: utf-8

from .common import InfoExtractor
from ..utils import (
    try_get,
    ExtractorError,
)

import json
import random
import re


class WPPilotBaseIE(InfoExtractor):
    """Shared helpers for the WP Pilot extractors (channel list handling)."""

    # Channel API endpoints; '%s' is filled in with the channel id.
    _VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s'
    _VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s'

    # HTTP headers passed along with the JSON API requests.
    _HEADERS_WEB = {
        'Content-Type': 'application/json; charset=UTF-8',
        'Referer': 'https://pilot.wp.pl/tv/',
    }

    def _get_channel_list(self, cache=True):
        """Return ``(channel_list, from_cache)``.

        With ``cache=True`` a non-empty list stored in the downloader cache
        is returned right away (``from_cache`` is True).  Otherwise the list
        is looked up in the static query files referenced by the /tv/ page
        data, written to the cache and returned with ``from_cache`` False.

        Raises ExtractorError when none of the static queries contains
        the channel list.
        """
        if cache is True:
            cached = self._downloader.cache.load('wppilot', 'channel-list')
            if cached:
                return cached, True

        webpage = self._download_webpage(
            'https://pilot.wp.pl/tv/', None, 'Downloading webpage')
        build_url = self._search_regex(
            r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)',
            webpage, 'gatsby build version')
        page_data_base_url = build_url + '/page-data'
        page_data = self._download_json(
            f'{page_data_base_url}/tv/page-data.json', None,
            'Downloading page data')

        # Probe each static query in turn; the first one exposing
        # data.allChannels.nodes is taken as the channel list.
        for qhash in page_data['staticQueryHashes']:
            query_result = self._download_json(
                f'{page_data_base_url}/sq/d/{qhash}.json', None,
                'Searching for channel list')
            nodes = try_get(
                query_result, lambda x: x['data']['allChannels']['nodes'])
            if nodes is not None:
                self._downloader.cache.store('wppilot', 'channel-list', nodes)
                return nodes, False

        raise ExtractorError('Unable to find the channel list')

    def _parse_channel(self, chan):
        """Build the basic info dict (id, title, thumbnails) for a channel entry."""
        thumbnails = []
        for key in ('thumbnail', 'thumbnail_mobile', 'icon'):
            # Only image fields that are present and non-empty are listed.
            if chan.get(key):
                thumbnails.append({
                    'id': key,
                    'url': chan[key],
                })

        return {
            'id': str(chan['id']),
            'title': chan['name'],
            'is_live': True,
            'thumbnails': thumbnails,
        }


class WPPilotIE(WPPilotBaseIE):
    """Extractor for a single WP Pilot live channel.

    Handles ``https://pilot.wp.pl/tv/#<slug>`` URLs as well as the internal
    ``wppilot:<id or slug>`` form.
    """

    _VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)'
    IE_NAME = 'wppilot'

    _TESTS = [{
        'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd',
        'info_dict': {
            'id': '158',
            'ext': 'mp4',
            'title': 'Telewizja WP HD',
        },
        'params': {
            'format': 'bestvideo',
        },
    }, {
        # audio only
        'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat',
        'info_dict': {
            'id': '238',
            'ext': 'm4a',
            'title': 'Radio Nowy Świat',
        },
        'params': {
            'format': 'bestaudio',
        },
    }, {
        'url': 'wppilot:9',
        'only_matching': True,
    }]

    def _find_channel(self, channel_list, key, value):
        """Return the parsed channel whose ``key`` field equals ``value``, or None.

        The field is stringified before the comparison, mirroring the
        ``str(chan['id'])`` normalisation done by ``_parse_channel``, so a
        numeric id in the channel list still matches the textual URL part.
        """
        for chan in channel_list:
            candidate = chan.get(key)
            if candidate is not None and str(candidate) == value:
                return self._parse_channel(chan)
        return None

    def _get_channel(self, id_or_slug):
        """Look up a channel by numeric id or by slug.

        The (possibly cached) channel list is searched first; if the list
        came from the cache and has no match, it is downloaded again and
        searched once more.

        Raises ExtractorError('Channel not found') when there is no match.
        """
        channel_list, is_cached = self._get_channel_list(cache=True)
        # An all-digit value is treated as a channel id, anything else as a slug.
        key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug'

        channel = self._find_channel(channel_list, key, id_or_slug)
        if channel is None and is_cached:
            # if cached channel not found, download and retry
            channel_list, _ = self._get_channel_list(cache=False)
            channel = self._find_channel(channel_list, key, id_or_slug)

        if channel is None:
            raise ExtractorError('Channel not found')
        return channel

    def _real_extract(self, url):
        """Resolve the channel and return its info dict with HLS formats.

        Raises ExtractorError when the API response carries no stream list.
        """
        video_id = self._match_id(url)

        channel = self._get_channel(video_id)
        video_id = str(channel['id'])

        session_cookie = next(
            (c for c in self._downloader.cookiejar if c.name == 'netviapisessid'),
            None)
        # cookies starting with "g:" are assigned to guests
        is_authorized = (
            session_cookie is not None
            and not session_cookie.value.startswith('g:'))

        # HTTP 422 is accepted so that the JSON error body can be inspected.
        video = self._download_json(
            (self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id,
            video_id, query={
                'device_type': 'web',
            }, headers=self._HEADERS_WEB,
            expected_status=(200, 422))

        # An error carrying a stream token is answered by closing the
        # previous stream session and then running the extraction again.
        stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token'])
        if stream_token:
            close = self._download_json(
                'https://pilot.wp.pl/api/v1/channels/close', video_id,
                'Invalidating previous stream session', headers=self._HEADERS_WEB,
                data=json.dumps({
                    'channelId': video_id,
                    't': stream_token,
                }).encode('utf-8'))
            if try_get(close, lambda x: x['data']['status']) == 'ok':
                return self.url_result(url, ie=WPPilotIE.ie_key())

        # Fail with a readable message instead of a bare KeyError/TypeError
        # when the response (e.g. an error body) has no stream list.
        streams = try_get(video, lambda x: x['data']['stream_channel']['streams'], list)
        if streams is None:
            raise ExtractorError('Unable to find the stream list in the API response')

        formats = []
        for fmt in streams:
            # Only HLS is extracted; live DASH ('dash@live:abr') does not work for now.
            if fmt['type'] == 'hls@live:abr':
                # 'url' holds several alternatives; one is picked at random.
                formats.extend(
                    self._extract_m3u8_formats(
                        random.choice(fmt['url']),
                        video_id, live=True))

        self._sort_formats(formats)

        channel['formats'] = formats
        return channel


class WPPilotChannelsIE(WPPilotBaseIE):
    """Playlist extractor yielding one entry per channel in the channel list."""

    _VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$'
    IE_NAME = 'wppilot:channels'

    _TESTS = [{
        'url': 'wppilot:',
        'info_dict': {
            'id': 'wppilot',
            'title': 'WP Pilot',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://pilot.wp.pl/',
        'only_matching': True,
    }]

    def _entries(self):
        """Yield a url_transparent entry pointing at WPPilotIE for every channel."""
        channels, _from_cache = self._get_channel_list()
        for chan in channels:
            # Parsed channel metadata first, then the redirection fields.
            yield {
                **self._parse_channel(chan),
                '_type': 'url_transparent',
                'url': f'wppilot:{chan["id"]}',
                'ie_key': WPPilotIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the playlist of all channels; the URL itself carries no id."""
        entries = self._entries()
        return self.playlist_result(entries, 'wppilot', 'WP Pilot')
Commit	Line	Data
c0599d4f LL	1	# coding: utf-8
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	try_get,
	6	ExtractorError,
	7	)
	8
	9	import json
	10	import random
	11	import re
	12
	13
	14	class WPPilotBaseIE(InfoExtractor):
	15	_VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s'
	16	_VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s'
	17
	18	_HEADERS_WEB = {
	19	'Content-Type': 'application/json; charset=UTF-8',
	20	'Referer': 'https://pilot.wp.pl/tv/',
	21	}
	22
	23	def _get_channel_list(self, cache=True):
	24	if cache is True:
	25	cache_res = self._downloader.cache.load('wppilot', 'channel-list')
	26	if cache_res:
	27	return cache_res, True
	28	webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage')
	29	page_data_base_url = self._search_regex(
	30	r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)',
	31	webpage, 'gatsby build version') + '/page-data'
	32	page_data = self._download_json(f'{page_data_base_url}/tv/page-data.json', None, 'Downloading page data')
	33	for qhash in page_data['staticQueryHashes']:
	34	qhash_content = self._download_json(
	35	f'{page_data_base_url}/sq/d/{qhash}.json', None,
	36	'Searching for channel list')
	37	channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes'])
	38	if channel_list is None:
	39	continue
	40	self._downloader.cache.store('wppilot', 'channel-list', channel_list)
	41	return channel_list, False
	42	raise ExtractorError('Unable to find the channel list')
	43
	44	def _parse_channel(self, chan):
	45	return {
	46	'id': str(chan['id']),
	47	'title': chan['name'],
	48	'is_live': True,
	49	'thumbnails': [{
	50	'id': key,
	51	'url': chan[key],
	52	} for key in ('thumbnail', 'thumbnail_mobile', 'icon') if chan.get(key)],
	53	}
	54
	55
	56	class WPPilotIE(WPPilotBaseIE):
	57	_VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)'
	58	IE_NAME = 'wppilot'
	59
	60	_TESTS = [{
	61	'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd',
	62	'info_dict': {
	63	'id': '158',
	64	'ext': 'mp4',
65	'title': 'Telewizja WP HD',
66	},
67	'params': {
68	'format': 'bestvideo',
69	},
70	}, {
71	# audio only
72	'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat',
73	'info_dict': {
74	'id': '238',
75	'ext': 'm4a',
76	'title': 'Radio Nowy Świat',
77	},
78	'params': {
79	'format': 'bestaudio',
80	},
81	}, {
82	'url': 'wppilot:9',
83	'only_matching': True,
84	}]
85
86	def _get_channel(self, id_or_slug):
87	video_list, is_cached = self._get_channel_list(cache=True)
88	key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug'
89	for video in video_list:
90	if video.get(key) == id_or_slug:
91	return self._parse_channel(video)
92	# if cached channel not found, download and retry
93	if is_cached:
94	video_list, _ = self._get_channel_list(cache=False)
95	for video in video_list:
96	if video.get(key) == id_or_slug:
97	return self._parse_channel(video)
98	raise ExtractorError('Channel not found')
99
100	def _real_extract(self, url):
101	video_id = self._match_id(url)
102
103	channel = self._get_channel(video_id)
104	video_id = str(channel['id'])
105
106	is_authorized = next((c for c in self._downloader.cookiejar if c.name == 'netviapisessid'), None)
107	# cookies starting with "g:" are assigned to guests
108	is_authorized = True if is_authorized is not None and not is_authorized.value.startswith('g:') else False
109
110	video = self._download_json(
111	(self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id,
112	video_id, query={
113	'device_type': 'web',
114	}, headers=self._HEADERS_WEB,
115	expected_status=(200, 422))
116
117	stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token'])
118	if stream_token:
119	close = self._download_json(
120	'https://pilot.wp.pl/api/v1/channels/close', video_id,
121	'Invalidating previous stream session', headers=self._HEADERS_WEB,
122	data=json.dumps({
123	'channelId': video_id,
124	't': stream_token,
125	}).encode('utf-8'))
126	if try_get(close, lambda x: x['data']['status']) == 'ok':
127	return self.url_result(url, ie=WPPilotIE.ie_key())
128
129	formats = []
130
131	for fmt in video['data']['stream_channel']['streams']:
132	# live DASH does not work for now
133	# if fmt['type'] == 'dash@live:abr':
134	# formats.extend(
135	# self._extract_mpd_formats(
136	# random.choice(fmt['url']), video_id))
137	if fmt['type'] == 'hls@live:abr':
138	formats.extend(
139	self._extract_m3u8_formats(
140	random.choice(fmt['url']),
141	video_id, live=True))
142
143	self._sort_formats(formats)
144
145	channel['formats'] = formats
146	return channel
147
148
149	class WPPilotChannelsIE(WPPilotBaseIE):
	150	_VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$'
151	IE_NAME = 'wppilot:channels'
152
153	_TESTS = [{
154	'url': 'wppilot:',
155	'info_dict': {
156	'id': 'wppilot',
157	'title': 'WP Pilot',
158	},
159	'playlist_mincount': 100,
160	}, {
161	'url': 'https://pilot.wp.pl/',
162	'only_matching': True,
163	}]
164
165	def _entries(self):
166	channel_list, _ = self._get_channel_list()
167	for chan in channel_list:
168	entry = self._parse_channel(chan)
169	entry.update({
170	'_type': 'url_transparent',
171	'url': f'wppilot:{chan["id"]}',
172	'ie_key': WPPilotIE.ie_key(),
173	})
174	yield entry
175
176	def _real_extract(self, url):
177	return self.playlist_result(self._entries(), 'wppilot', 'WP Pilot')