yt_dlp/extractor/wppilot.py

   1 import json
   2 import random
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     try_get,
   9 )
  10
  11
  12 class WPPilotBaseIE(InfoExtractor):
  13     _VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s'
  14     _VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s'
  15
  16     _HEADERS_WEB = {
  17         'Content-Type': 'application/json; charset=UTF-8',
  18         'Referer': 'https://pilot.wp.pl/tv/',
  19     }
  20
  21     def _get_channel_list(self, cache=True):
  22         if cache is True:
  23             cache_res = self.cache.load('wppilot', 'channel-list')
  24             if cache_res:
  25                 return cache_res, True
  26         webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage')
  27         page_data_base_url = self._search_regex(
  28             r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)',
  29             webpage, 'gatsby build version') + '/page-data'
  30         page_data = self._download_json(f'{page_data_base_url}/tv/page-data.json', None, 'Downloading page data')
  31         for qhash in page_data['staticQueryHashes']:
  32             qhash_content = self._download_json(
  33                 f'{page_data_base_url}/sq/d/{qhash}.json', None,
  34                 'Searching for channel list')
  35             channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes'])
  36             if channel_list is None:
  37                 continue
  38             self.cache.store('wppilot', 'channel-list', channel_list)
  39             return channel_list, False
  40         raise ExtractorError('Unable to find the channel list')
  41
  42     def _parse_channel(self, chan):
  43         return {
  44             'id': str(chan['id']),
  45             'title': chan['name'],
  46             'is_live': True,
  47             'thumbnails': [{
  48                 'id': key,
  49                 'url': chan[key],
  50             } for key in ('thumbnail', 'thumbnail_mobile', 'icon') if chan.get(key)],
  51         }
  52
  53
  54 class WPPilotIE(WPPilotBaseIE):
  55     _VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)'
  56     IE_NAME = 'wppilot'
  57
  58     _TESTS = [{
  59         'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd',
  60         'info_dict': {
  61             'id': '158',
  62             'ext': 'mp4',
  63             'title': 'Telewizja WP HD',
  64         },
  65         'params': {
  66             'format': 'bestvideo',
  67         },
  68     }, {
  69         # audio only
  70         'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat',
  71         'info_dict': {
  72             'id': '238',
  73             'ext': 'm4a',
  74             'title': 'Radio Nowy Świat',
  75         },
  76         'params': {
  77             'format': 'bestaudio',
  78         },
  79     }, {
  80         'url': 'wppilot:9',
  81         'only_matching': True,
  82     }]
  83
  84     def _get_channel(self, id_or_slug):
  85         video_list, is_cached = self._get_channel_list(cache=True)
  86         key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug'
  87         for video in video_list:
  88             if video.get(key) == id_or_slug:
  89                 return self._parse_channel(video)
  90         # if cached channel not found, download and retry
  91         if is_cached:
  92             video_list, _ = self._get_channel_list(cache=False)
  93             for video in video_list:
  94                 if video.get(key) == id_or_slug:
  95                     return self._parse_channel(video)
  96         raise ExtractorError('Channel not found')
  97
  98     def _real_extract(self, url):
  99         video_id = self._match_id(url)
 100
 101         channel = self._get_channel(video_id)
 102         video_id = str(channel['id'])
 103
 104         is_authorized = next((c for c in self.cookiejar if c.name == 'netviapisessid'), None)
 105         # cookies starting with "g:" are assigned to guests
 106         is_authorized = True if is_authorized is not None and not is_authorized.value.startswith('g:') else False
 107
 108         video = self._download_json(
 109             (self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id,
 110             video_id, query={
 111                 'device_type': 'web',
 112             }, headers=self._HEADERS_WEB,
 113             expected_status=(200, 422))
 114
 115         stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token'])
 116         if stream_token:
 117             close = self._download_json(
 118                 'https://pilot.wp.pl/api/v1/channels/close', video_id,
 119                 'Invalidating previous stream session', headers=self._HEADERS_WEB,
 120                 data=json.dumps({
 121                     'channelId': video_id,
 122                     't': stream_token,
 123                 }).encode('utf-8'))
 124             if try_get(close, lambda x: x['data']['status']) == 'ok':
 125                 return self.url_result(url, ie=WPPilotIE.ie_key())
 126
 127         formats = []
 128
 129         for fmt in video['data']['stream_channel']['streams']:
 130             # live DASH does not work for now
 131             # if fmt['type'] == 'dash@live:abr':
 132             #     formats.extend(
 133             #         self._extract_mpd_formats(
 134             #             random.choice(fmt['url']), video_id))
 135             if fmt['type'] == 'hls@live:abr':
 136                 formats.extend(
 137                     self._extract_m3u8_formats(
 138                         random.choice(fmt['url']),
 139                         video_id, live=True))
 140
 141         channel['formats'] = formats
 142         return channel
 143
 144
 145 class WPPilotChannelsIE(WPPilotBaseIE):
 146     _VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$'
 147     IE_NAME = 'wppilot:channels'
 148
 149     _TESTS = [{
 150         'url': 'wppilot:',
 151         'info_dict': {
 152             'id': 'wppilot',
 153             'title': 'WP Pilot',
 154         },
 155         'playlist_mincount': 100,
 156     }, {
 157         'url': 'https://pilot.wp.pl/',
 158         'only_matching': True,
 159     }]
 160
 161     def _entries(self):
 162         channel_list, _ = self._get_channel_list()
 163         for chan in channel_list:
 164             entry = self._parse_channel(chan)
 165             entry.update({
 166                 '_type': 'url_transparent',
 167                 'url': f'wppilot:{chan["id"]}',
 168                 'ie_key': WPPilotIE.ie_key(),
 169             })
 170             yield entry
 171
 172     def _real_extract(self, url):
 173         return self.playlist_result(self._entries(), 'wppilot', 'WP Pilot')