yt_dlp/extractor/redbee.py

   1 import json
   2 import re
   3 import time
   4 import urllib.parse
   5 import uuid
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     ExtractorError,
  10     float_or_none,
  11     int_or_none,
  12     strip_or_none,
  13     traverse_obj,
  14     try_call,
  15     unified_timestamp,
  16 )
  17
  18
  19 class RedBeeBaseIE(InfoExtractor):
  20     _DEVICE_ID = str(uuid.uuid4())
  21
  22     @property
  23     def _API_URL(self):
  24         """
  25         Ref: https://apidocs.emp.ebsd.ericsson.net
  26         Subclasses must set _REDBEE_CUSTOMER, _REDBEE_BUSINESS_UNIT
  27         """
  28         return f'https://exposure.api.redbee.live/v2/customer/{self._REDBEE_CUSTOMER}/businessunit/{self._REDBEE_BUSINESS_UNIT}'
  29
  30     def _get_bearer_token(self, asset_id, jwt=None):
  31         request = {
  32             'deviceId': self._DEVICE_ID,
  33             'device': {
  34                 'deviceId': self._DEVICE_ID,
  35                 'name': 'Mozilla Firefox 102',
  36                 'type': 'WEB',
  37             },
  38         }
  39         if jwt:
  40             request['jwt'] = jwt
  41
  42         return self._download_json(
  43             f'{self._API_URL}/auth/{"gigyaLogin" if jwt else "anonymous"}',
  44             asset_id, data=json.dumps(request).encode(), headers={
  45                 'Content-Type': 'application/json;charset=utf-8',
  46             })['sessionToken']
  47
  48     def _get_formats_and_subtitles(self, asset_id, **kwargs):
  49         bearer_token = self._get_bearer_token(asset_id, **kwargs)
  50         api_response = self._download_json(
  51             f'{self._API_URL}/entitlement/{asset_id}/play',
  52             asset_id, headers={
  53                 'Authorization': f'Bearer {bearer_token}',
  54                 'Accept': 'application/json, text/plain, */*',
  55             })
  56
  57         formats, subtitles = [], {}
  58         for format_data in api_response['formats']:
  59             if not format_data.get('mediaLocator'):
  60                 continue
  61
  62             fmts, subs = [], {}
  63             if format_data.get('format') == 'DASH':
  64                 fmts, subs = self._extract_mpd_formats_and_subtitles(
  65                     format_data['mediaLocator'], asset_id, fatal=False)
  66             elif format_data.get('format') == 'SMOOTHSTREAMING':
  67                 fmts, subs = self._extract_ism_formats_and_subtitles(
  68                     format_data['mediaLocator'], asset_id, fatal=False)
  69             elif format_data.get('format') == 'HLS':
  70                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
  71                     format_data['mediaLocator'], asset_id, fatal=False)
  72
  73             if format_data.get('drm'):
  74                 for f in fmts:
  75                     f['has_drm'] = True
  76
  77             formats.extend(fmts)
  78             self._merge_subtitles(subs, target=subtitles)
  79
  80         return formats, subtitles
  81
  82
  83 class ParliamentLiveUKIE(RedBeeBaseIE):
  84     IE_NAME = 'parliamentlive.tv'
  85     IE_DESC = 'UK parliament videos'
  86     _VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
  87
  88     _REDBEE_CUSTOMER = 'UKParliament'
  89     _REDBEE_BUSINESS_UNIT = 'ParliamentLive'
  90
  91     _TESTS = [{
  92         'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b',
  93         'info_dict': {
  94             'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b',
  95             'ext': 'mp4',
  96             'title': 'Home Affairs Committee',
  97             'timestamp': 1395153872,
  98             'upload_date': '20140318',
  99             'thumbnail': r're:https?://[^?#]+c1e9d44d-fd6c-4263-b50f-97ed26cc998b[^/]*/thumbnail',
 100         },
 101     }, {
 102         'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4',
 103         'only_matching': True,
 104     }, {
 105         'url': 'https://parliamentlive.tv/Event/Index/27cf25e4-e77b-42a3-93c5-c815cd6d7377',
 106         'info_dict': {
 107             'id': '27cf25e4-e77b-42a3-93c5-c815cd6d7377',
 108             'ext': 'mp4',
 109             'title': 'House of Commons',
 110             'timestamp': 1658392447,
 111             'upload_date': '20220721',
 112             'thumbnail': r're:https?://[^?#]+27cf25e4-e77b-42a3-93c5-c815cd6d7377[^/]*/thumbnail',
 113         },
 114     }]
 115
 116     def _real_extract(self, url):
 117         video_id = self._match_id(url)
 118
 119         formats, subtitles = self._get_formats_and_subtitles(video_id)
 120
 121         video_info = self._download_json(
 122             f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id, fatal=False)
 123
 124         return {
 125             'id': video_id,
 126             'formats': formats,
 127             'subtitles': subtitles,
 128             'title': traverse_obj(video_info, ('event', 'title')),
 129             'thumbnail': traverse_obj(video_info, 'thumbnailUrl'),
 130             'timestamp': traverse_obj(
 131                 video_info, ('event', 'publishedStartTime'), expected_type=unified_timestamp),
 132             '_format_sort_fields': ('res', 'proto'),
 133         }
 134
 135
 136 class RTBFIE(RedBeeBaseIE):
 137     _WORKING = False
 138     _VALID_URL = r'''(?x)
 139         https?://(?:www\.)?rtbf\.be/
 140         (?:
 141             video/[^?]+\?.*\bid=|
 142             ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=|
 143             auvio/[^/]+\?.*\b(?P<live>l)?id=
 144         )(?P<id>\d+)'''
 145     _NETRC_MACHINE = 'rtbf'
 146
 147     _REDBEE_CUSTOMER = 'RTBF'
 148     _REDBEE_BUSINESS_UNIT = 'Auvio'
 149
 150     _TESTS = [{
 151         'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
 152         'md5': '8c876a1cceeb6cf31b476461ade72384',
 153         'info_dict': {
 154             'id': '1921274',
 155             'ext': 'mp4',
 156             'title': 'Les Diables au coeur (épisode 2)',
 157             'description': '(du 25/04/2014)',
 158             'duration': 3099.54,
 159             'upload_date': '20140425',
 160             'timestamp': 1398456300,
 161         },
 162         'skip': 'No longer available',
 163     }, {
 164         # geo restricted
 165         'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442',
 166         'only_matching': True,
 167     }, {
 168         'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858',
 169         'only_matching': True,
 170     }, {
 171         'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996',
 172         'only_matching': True,
 173     }, {
 174         # Live
 175         'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775',
 176         'only_matching': True,
 177     }, {
 178         # Audio
 179         'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811',
 180         'only_matching': True,
 181     }, {
 182         # With Subtitle
 183         'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588',
 184         'only_matching': True,
 185     }, {
 186         'url': 'https://www.rtbf.be/auvio/detail_investigation?id=2921926',
 187         'md5': 'd5d11bb62169fef38d7ce7ac531e034f',
 188         'info_dict': {
 189             'id': '2921926',
 190             'ext': 'mp4',
 191             'title': 'Le handicap un confinement perpétuel - Maladie de Lyme',
 192             'description': 'md5:dcbd5dcf6015488c9069b057c15ccc52',
 193             'duration': 5258.8,
 194             'upload_date': '20220727',
 195             'timestamp': 1658934000,
 196             'series': '#Investigation',
 197             'thumbnail': r're:^https?://[^?&]+\.jpg$',
 198         },
 199     }, {
 200         'url': 'https://www.rtbf.be/auvio/detail_la-belgique-criminelle?id=2920492',
 201         'md5': '054f9f143bc79c89647c35e5a7d35fa8',
 202         'info_dict': {
 203             'id': '2920492',
 204             'ext': 'mp4',
 205             'title': '04 - Le crime de la rue Royale',
 206             'description': 'md5:0c3da1efab286df83f2ab3f8f96bd7a6',
 207             'duration': 1574.6,
 208             'upload_date': '20220723',
 209             'timestamp': 1658596887,
 210             'series': 'La Belgique criminelle - TV',
 211             'thumbnail': r're:^https?://[^?&]+\.jpg$',
 212         },
 213     }]
 214
 215     _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be'
 216     _PROVIDERS = {
 217         'YOUTUBE': 'Youtube',
 218         'DAILYMOTION': 'Dailymotion',
 219         'VIMEO': 'Vimeo',
 220     }
 221     _QUALITIES = [
 222         ('mobile', 'SD'),
 223         ('web', 'MD'),
 224         ('high', 'HD'),
 225     ]
 226     _LOGIN_URL = 'https://login.rtbf.be/accounts.login'
 227     _GIGYA_API_KEY = '3_kWKuPgcdAybqnqxq_MvHVk0-6PN8Zk8pIIkJM_yXOu-qLPDDsGOtIDFfpGivtbeO'
 228     _LOGIN_COOKIE_ID = f'glt_{_GIGYA_API_KEY}'
 229
 230     def _perform_login(self, username, password):
 231         if self._get_cookies(self._LOGIN_URL).get(self._LOGIN_COOKIE_ID):
 232             return
 233
 234         self._set_cookie('.rtbf.be', 'gmid', 'gmid.ver4', secure=True, expire_time=time.time() + 3600)
 235
 236         login_response = self._download_json(
 237             self._LOGIN_URL, None, data=urllib.parse.urlencode({
 238                 'loginID': username,
 239                 'password': password,
 240                 'APIKey': self._GIGYA_API_KEY,
 241                 'targetEnv': 'jssdk',
 242                 'sessionExpiration': '-2',
 243             }).encode(), headers={
 244                 'Content-Type': 'application/x-www-form-urlencoded',
 245             })
 246
 247         if login_response['statusCode'] != 200:
 248             raise ExtractorError('Login failed. Server message: {}'.format(login_response['errorMessage']), expected=True)
 249
 250         self._set_cookie('.rtbf.be', self._LOGIN_COOKIE_ID, login_response['sessionInfo']['login_token'],
 251                          secure=True, expire_time=time.time() + 3600)
 252
 253     def _get_formats_and_subtitles(self, url, media_id):
 254         login_token = self._get_cookies(url).get(self._LOGIN_COOKIE_ID)
 255         if not login_token:
 256             self.raise_login_required()
 257
 258         session_jwt = try_call(lambda: self._get_cookies(url)['rtbf_jwt'].value) or self._download_json(
 259             'https://login.rtbf.be/accounts.getJWT', media_id, query={
 260                 'login_token': login_token.value,
 261                 'APIKey': self._GIGYA_API_KEY,
 262                 'sdk': 'js_latest',
 263                 'authMode': 'cookie',
 264                 'pageURL': url,
 265                 'sdkBuild': '13273',
 266                 'format': 'json',
 267             })['id_token']
 268
 269         return super()._get_formats_and_subtitles(media_id, jwt=session_jwt)
 270
 271     def _real_extract(self, url):
 272         live, media_id = self._match_valid_url(url).groups()
 273         embed_page = self._download_webpage(
 274             'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'),
 275             media_id, query={'id': media_id})
 276
 277         media_data = self._html_search_regex(r'data-media="([^"]+)"', embed_page, 'media data', fatal=False)
 278         if not media_data:
 279             if re.search(r'<div[^>]+id="js-error-expired"[^>]+class="(?![^"]*hidden)', embed_page):
 280                 raise ExtractorError('Livestream has ended.', expected=True)
 281             if re.search(r'<div[^>]+id="js-sso-connect"[^>]+class="(?![^"]*hidden)', embed_page):
 282                 self.raise_login_required()
 283
 284             raise ExtractorError('Could not find media data')
 285
 286         data = self._parse_json(media_data, media_id)
 287
 288         error = data.get('error')
 289         if error:
 290             raise ExtractorError(f'{self.IE_NAME} said: {error}', expected=True)
 291
 292         provider = data.get('provider')
 293         if provider in self._PROVIDERS:
 294             return self.url_result(data['url'], self._PROVIDERS[provider])
 295
 296         title = traverse_obj(data, 'subtitle', 'title')
 297         is_live = data.get('isLive')
 298         height_re = r'-(\d+)p\.'
 299         formats, subtitles = [], {}
 300
 301         # The old api still returns m3u8 and mpd manifest for livestreams, but these are 'fake'
 302         # since all they contain is a 20s video that is completely unrelated.
 303         # https://github.com/yt-dlp/yt-dlp/issues/4656#issuecomment-1214461092
 304         m3u8_url = None if data.get('isLive') else traverse_obj(data, 'urlHlsAes128', 'urlHls')
 305         if m3u8_url:
 306             fmts, subs = self._extract_m3u8_formats_and_subtitles(
 307                 m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)
 308             formats.extend(fmts)
 309             self._merge_subtitles(subs, target=subtitles)
 310
 311         fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x
 312         http_url = data.get('url')
 313         if formats and http_url and re.search(height_re, http_url):
 314             http_url = fix_url(http_url)
 315             for m3u8_f in formats[:]:
 316                 height = m3u8_f.get('height')
 317                 if not height:
 318                     continue
 319                 f = m3u8_f.copy()
 320                 del f['protocol']
 321                 f.update({
 322                     'format_id': m3u8_f['format_id'].replace('hls-', 'http-'),
 323                     'url': re.sub(height_re, '-%dp.' % height, http_url),
 324                 })
 325                 formats.append(f)
 326         else:
 327             sources = data.get('sources') or {}
 328             for key, format_id in self._QUALITIES:
 329                 format_url = sources.get(key)
 330                 if not format_url:
 331                     continue
 332                 height = int_or_none(self._search_regex(
 333                     height_re, format_url, 'height', default=None))
 334                 formats.append({
 335                     'format_id': format_id,
 336                     'url': fix_url(format_url),
 337                     'height': height,
 338                 })
 339
 340         mpd_url = None if data.get('isLive') else data.get('urlDash')
 341         if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')):
 342             fmts, subs = self._extract_mpd_formats_and_subtitles(
 343                 mpd_url, media_id, mpd_id='dash', fatal=False)
 344             formats.extend(fmts)
 345             self._merge_subtitles(subs, target=subtitles)
 346
 347         audio_url = data.get('urlAudio')
 348         if audio_url:
 349             formats.append({
 350                 'format_id': 'audio',
 351                 'url': audio_url,
 352                 'vcodec': 'none',
 353             })
 354
 355         for track in (data.get('tracks') or {}).values():
 356             sub_url = track.get('url')
 357             if not sub_url:
 358                 continue
 359             subtitles.setdefault(track.get('lang') or 'fr', []).append({
 360                 'url': sub_url,
 361             })
 362
 363         if not formats:
 364             fmts, subs = self._get_formats_and_subtitles(url, f'live_{media_id}' if is_live else media_id)
 365             formats.extend(fmts)
 366             self._merge_subtitles(subs, target=subtitles)
 367
 368         return {
 369             'id': media_id,
 370             'formats': formats,
 371             'title': title,
 372             'description': strip_or_none(data.get('description')),
 373             'thumbnail': data.get('thumbnail'),
 374             'duration': float_or_none(data.get('realDuration')),
 375             'timestamp': int_or_none(data.get('liveFrom')),
 376             'series': data.get('programLabel'),
 377             'subtitles': subtitles,
 378             'is_live': is_live,
 379             '_format_sort_fields': ('res', 'proto'),
 380         }