X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/3c239332b0df3b22a5cbd66930ad240d2398fb44..4ce05f57599961c853253398b993c94efb504048:/yt_dlp/extractor/cbc.py diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 61fe4074c..cac3f1e9d 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -1,18 +1,20 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re +import json +import base64 +import time from .common import InfoExtractor from ..compat import ( compat_str, ) from ..utils import ( + int_or_none, + join_nonempty, js_to_json, - smuggle_url, - try_get, orderedSet, + smuggle_url, strip_or_none, + try_get, ExtractorError, ) @@ -122,9 +124,9 @@ def _extract_player_init(self, player_init, display_id): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'([^<]+)', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, 'title', default=None) + or self._html_extract_title(webpage)) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -244,37 +246,129 @@ class CBCGemIE(InfoExtractor): 'params': {'format': 'bv'}, 'skip': 'Geo-restricted to Canada', }] - _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + + _GEO_COUNTRIES = ['CA'] + _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcgem' + _claims_token = None + + def _new_claims_token(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._TOKEN_API_KEY} + resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login', + None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + query = { + 'access_token': access_token, + 'apikey': self._TOKEN_API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token', + None, headers=headers, query=query) + sig = resp['signature'] + + data = json.dumps({'jwt': sig}).encode() + headers = {'content-type': 'application/json', 'ott-device-type': 'web'} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', + None, data=data, headers=headers) + cbc_access_token = resp['accessToken'] + + headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', + None, headers=headers) + return resp['claimsToken'] + + def _get_claims_token_expiry(self): + # Token is a JWT + # JWT is decoded here and 'exp' field is extracted + # It is a Unix timestamp for when the token expires + b64_data = self._claims_token.split('.')[1] + data = base64.urlsafe_b64decode(b64_data + "==") + return json.loads(data)['exp'] + + def claims_token_expired(self): + exp = self._get_claims_token_expiry() + if exp - time.time() < 10: + # It will expire in less than 10 seconds, or has already expired + return True + return False + + def claims_token_valid(self): + return self._claims_token is not None and not self.claims_token_expired() + + def _get_claims_token(self, email, password): + if not self.claims_token_valid(): + self._claims_token = self._new_claims_token(email, password) + self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + return self._claims_token + + def _real_initialize(self): + if self.claims_token_valid(): + return + self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + + def _find_secret_formats(self, formats, video_id): + """ Find a valid video url and convert it to the secret variant """ + base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) + if not base_format: + return + + base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) + url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) + + secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) + if not secret_xml: + return + + for child in secret_xml: + if child.attrib.get('Type') != 'video': + continue + for video_quality in child: + bitrate = int_or_none(video_quality.attrib.get('Bitrate')) + if not bitrate or 'Index' not in video_quality.attrib: + continue + height = int_or_none(video_quality.attrib.get('MaxHeight')) + + yield { + **base_format, + 'format_id': join_nonempty('sec', height), + # Note: \g<1> is necessary instead of \1 since bitrate is a number + 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url), + 'width': int_or_none(video_quality.attrib.get('MaxWidth')), + 'tbr': bitrate / 1000.0, + 'height': height, + } def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json(self._API_BASE + video_id, video_id) - - last_error = None - attempt = -1 - retries = self.get_param('extractor_retries', 15) - while attempt < retries: - attempt += 1 - if last_error: - self.report_warning('%s. Retrying ...' % last_error) - m3u8_info = self._download_json( - video_info['playSession']['url'], video_id, - note='Downloading JSON metadata%s' % f' (attempt {attempt})') - m3u8_url = m3u8_info.get('url') - if m3u8_url: - break - elif m3u8_info.get('errorCode') == 1: - self.raise_geo_restricted(countries=['CA']) - else: - last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}' - # 35 means media unavailable, but retries work - if m3u8_info.get('errorCode') != 35 or attempt >= retries: - raise ExtractorError(last_error) + video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + + email, password = self._get_login_info() + if email and password: + claims_token = self._get_claims_token(email, password) + headers = {'x-claims-token': claims_token} + else: + headers = {} + m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + m3u8_url = m3u8_info.get('url') + + if m3u8_info.get('errorCode') == 1: + self.raise_geo_restricted(countries=['CA']) + elif m3u8_info.get('errorCode') == 35: + self.raise_login_required(method='password') + elif m3u8_info.get('errorCode') != 0: + raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') self._remove_duplicate_formats(formats) + formats.extend(self._find_secret_formats(formats, video_id)) - for i, format in enumerate(formats): + for format in formats: if format.get('vcodec') == 'none': if format.get('ext') is None: format['ext'] = 'm4a' @@ -328,7 +422,8 @@ def _real_extract(self, url): show = match.group('show') show_info = self._download_json(self._API_BASE + show, season_id) season = int(match.group('season')) - season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + + season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) if season_info is None: raise ExtractorError(f'Couldn\'t find season {season} of {show}')