[extractor/lbry] Use HEAD request for redirect URL (#4181)

[yt-dlp.git] / yt_dlp / extractor / cbc.py
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py

index 061b09908d503fc5ebbc802f6b4c7cc0736269cd..999b7bc53ae31774b6369997817f0d7883cc6052 100644 (file)
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -1,18 +1,20 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import re
+import json
+import base64
+import time
  
  from .common import InfoExtractor
  from ..compat import (
      compat_str,
  )
  from ..utils import (
+    int_or_none,
+    join_nonempty,
      js_to_json,
-    smuggle_url,
-    try_get,
      orderedSet,
+    smuggle_url,
      strip_or_none,
+    try_get,
      ExtractorError,
  )
  
@@ -122,9 +124,9 @@ def _extract_player_init(self, player_init, display_id):
      def _real_extract(self, url):
          display_id = self._match_id(url)
          webpage = self._download_webpage(url, display_id)
-        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
-            'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
-                r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+        title = (self._og_search_title(webpage, default=None)
+                 or self._html_search_meta('twitter:title', webpage, 'title', default=None)
+                 or self._html_extract_title(webpage))
          entries = [
              self._extract_player_init(player_init, display_id)
              for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
@@ -202,7 +204,6 @@ class CBCGemIE(InfoExtractor):
      IE_NAME = 'gem.cbc.ca'
      _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
      _TESTS = [{
-        # geo-restricted to Canada, bypassable
          # This is a normal, public, TV show video
          'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
          'md5': '93dbb31c74a8e45b378cf13bd3f6f11e',
@@ -224,7 +225,6 @@ class CBCGemIE(InfoExtractor):
          'params': {'format': 'bv'},
          'skip': 'Geo-restricted to Canada',
      }, {
-        # geo-restricted to Canada, bypassable
          # This video requires an account in the browser, but works fine in yt-dlp
          'url': 'https://gem.cbc.ca/media/schitts-creek/s01e01',
          'md5': '297a9600f554f2258aed01514226a697',
@@ -246,37 +246,129 @@ class CBCGemIE(InfoExtractor):
          'params': {'format': 'bv'},
          'skip': 'Geo-restricted to Canada',
      }]
-    _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/'
+
+    _GEO_COUNTRIES = ['CA']
+    _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
+    _NETRC_MACHINE = 'cbcgem'
+    _claims_token = None
+
+    def _new_claims_token(self, email, password):
+        data = json.dumps({
+            'email': email,
+            'password': password,
+        }).encode()
+        headers = {'content-type': 'application/json'}
+        query = {'apikey': self._TOKEN_API_KEY}
+        resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login',
+                                   None, data=data, headers=headers, query=query)
+        access_token = resp['access_token']
+
+        query = {
+            'access_token': access_token,
+            'apikey': self._TOKEN_API_KEY,
+            'jwtapp': 'jwt',
+        }
+        resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token',
+                                   None, headers=headers, query=query)
+        sig = resp['signature']
+
+        data = json.dumps({'jwt': sig}).encode()
+        headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
+        resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token',
+                                   None, data=data, headers=headers)
+        cbc_access_token = resp['accessToken']
+
+        headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token}
+        resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile',
+                                   None, headers=headers)
+        return resp['claimsToken']
+
+    def _get_claims_token_expiry(self):
+        # Token is a JWT
+        # JWT is decoded here and 'exp' field is extracted
+        # It is a Unix timestamp for when the token expires
+        b64_data = self._claims_token.split('.')[1]
+        data = base64.urlsafe_b64decode(b64_data + "==")
+        return json.loads(data)['exp']
+
+    def claims_token_expired(self):
+        exp = self._get_claims_token_expiry()
+        if exp - time.time() < 10:
+            # It will expire in less than 10 seconds, or has already expired
+            return True
+        return False
+
+    def claims_token_valid(self):
+        return self._claims_token is not None and not self.claims_token_expired()
+
+    def _get_claims_token(self, email, password):
+        if not self.claims_token_valid():
+            self._claims_token = self._new_claims_token(email, password)
+            self.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token)
+        return self._claims_token
+
+    def _real_initialize(self):
+        if self.claims_token_valid():
+            return
+        self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token')
+
+    def _find_secret_formats(self, formats, video_id):
+        """ Find a valid video url and convert it to the secret variant """
+        base_format = next((f for f in formats if f.get('vcodec') != 'none'), None)
+        if not base_format:
+            return
+
+        base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url'])
+        url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
+
+        secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False)
+        if not secret_xml:
+            return
+
+        for child in secret_xml:
+            if child.attrib.get('Type') != 'video':
+                continue
+            for video_quality in child:
+                bitrate = int_or_none(video_quality.attrib.get('Bitrate'))
+                if not bitrate or 'Index' not in video_quality.attrib:
+                    continue
+                height = int_or_none(video_quality.attrib.get('MaxHeight'))
+
+                yield {
+                    **base_format,
+                    'format_id': join_nonempty('sec', height),
+                    # Note: \g<1> is necessary instead of \1 since bitrate is a number
+                    'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
+                    'width': int_or_none(video_quality.attrib.get('MaxWidth')),
+                    'tbr': bitrate / 1000.0,
+                    'height': height,
+                }
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
-        video_info = self._download_json(self._API_BASE + video_id, video_id)
-
-        last_error = None
-        attempt = -1
-        retries = self.get_param('extractor_retries', 15)
-        while attempt < retries:
-            attempt += 1
-            if last_error:
-                self.report_warning('%s. Retrying ...' % last_error)
-            m3u8_info = self._download_json(
-                video_info['playSession']['url'], video_id,
-                note='Downloading JSON metadata%s' % f' (attempt {attempt})')
-            m3u8_url = m3u8_info.get('url')
-            if m3u8_url:
-                break
-            elif m3u8_info.get('errorCode') == 1:
-                self.raise_geo_restricted(countries=['CA'])
-            else:
-                last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}'
-                # 35 means media unavailable, but retries work
-                if m3u8_info.get('errorCode') != 35 or attempt >= retries:
-                    raise ExtractorError(last_error)
+        video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id)
+
+        email, password = self._get_login_info()
+        if email and password:
+            claims_token = self._get_claims_token(email, password)
+            headers = {'x-claims-token': claims_token}
+        else:
+            headers = {}
+        m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers)
+        m3u8_url = m3u8_info.get('url')
+
+        if m3u8_info.get('errorCode') == 1:
+            self.raise_geo_restricted(countries=['CA'])
+        elif m3u8_info.get('errorCode') == 35:
+            self.raise_login_required(method='password')
+        elif m3u8_info.get('errorCode') != 0:
+            raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}')
  
          formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
          self._remove_duplicate_formats(formats)
+        formats.extend(self._find_secret_formats(formats, video_id))
  
-        for i, format in enumerate(formats):
+        for format in formats:
              if format.get('vcodec') == 'none':
                  if format.get('ext') is None:
                      format['ext'] = 'm4a'
@@ -313,7 +405,6 @@ class CBCGemPlaylistIE(InfoExtractor):
      IE_NAME = 'gem.cbc.ca:playlist'
      _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
      _TESTS = [{
-        # geo-restricted to Canada, bypassable
          # TV show playlist, all public videos
          'url': 'https://gem.cbc.ca/media/schitts-creek/s06',
          'playlist_count': 16,
@@ -322,7 +413,6 @@ class CBCGemPlaylistIE(InfoExtractor):
              'title': 'Season 6',
              'description': 'md5:6a92104a56cbeb5818cc47884d4326a2',
          },
-        'skip': 'Geo-restricted to Canada',
      }]
      _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/'
  
@@ -332,7 +422,8 @@ def _real_extract(self, url):
          show = match.group('show')
          show_info = self._download_json(self._API_BASE + show, season_id)
          season = int(match.group('season'))
-        season_info = try_get(show_info, lambda x: x['seasons'][season - 1])
+
+        season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None)
  
          if season_info is None:
              raise ExtractorError(f'Couldn\'t find season {season} of {show}')
@@ -381,7 +472,7 @@ def _real_extract(self, url):
  
  class CBCGemLiveIE(InfoExtractor):
      IE_NAME = 'gem.cbc.ca:live'
-    _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})'
+    _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)'
      _TEST = {
          'url': 'https://gem.cbc.ca/live/920604739687',
          'info_dict': {
@@ -400,21 +491,21 @@ class CBCGemLiveIE(InfoExtractor):
  
      # It's unclear where the chars at the end come from, but they appear to be
      # constant. Might need updating in the future.
-    _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT'
+    # There are two URLs, some livestreams are in one, and some
+    # in the other. The JSON schema is the same for both.
+    _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT']
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
-        live_info = self._download_json(self._API, video_id)['entries']
-
-        video_info = None
-        for stream in live_info:
-            if stream.get('guid') == video_id:
-                video_info = stream
  
-        if video_info is None:
-            raise ExtractorError(
-                'Couldn\'t find video metadata, maybe this livestream is now offline',
-                expected=True)
+        for api_url in self._API_URLS:
+            video_info = next((
+                stream for stream in self._download_json(api_url, video_id)['entries']
+                if stream.get('guid') == video_id), None)
+            if video_info:
+                break
+        else:
+            raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
  
          return {
              '_type': 'url_transparent',