jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/cbc.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / cbc.py
index 413053499bf5a78e6526a45a83347e97d22c0f9c..210f5f8eea4cc14c341380832d823877d9c72a39 100644 (file)
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 import json
 import base64
     compat_str,
 )
 from ..utils import (
+    int_or_none,
+    join_nonempty,
     js_to_json,
-    smuggle_url,
-    try_get,
     orderedSet,
+    smuggle_url,
     strip_or_none,
+    try_get,
     ExtractorError,
 )
 
@@ -125,9 +124,9 @@ def _extract_player_init(self, player_init, display_id):
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
-        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
-            'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
-                r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+        title = (self._og_search_title(webpage, default=None)
+                 or self._html_search_meta('twitter:title', webpage, 'title', default=None)
+                 or self._html_extract_title(webpage))
         entries = [
             self._extract_player_init(player_init, display_id)
             for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
@@ -305,13 +304,45 @@ def claims_token_valid(self):
     def _get_claims_token(self, email, password):
         if not self.claims_token_valid():
             self._claims_token = self._new_claims_token(email, password)
-            self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token)
+            self.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token)
         return self._claims_token
 
     def _real_initialize(self):
         if self.claims_token_valid():
             return
-        self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token')
+        self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token')
+
+    def _find_secret_formats(self, formats, video_id):
+        """ Find a valid video url and convert it to the secret variant """
+        base_format = next((f for f in formats if f.get('vcodec') != 'none'), None)
+        if not base_format:
+            return
+
+        base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url'])
+        url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
+
+        secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False)
+        if not secret_xml:
+            return
+
+        for child in secret_xml:
+            if child.attrib.get('Type') != 'video':
+                continue
+            for video_quality in child:
+                bitrate = int_or_none(video_quality.attrib.get('Bitrate'))
+                if not bitrate or 'Index' not in video_quality.attrib:
+                    continue
+                height = int_or_none(video_quality.attrib.get('MaxHeight'))
+
+                yield {
+                    **base_format,
+                    'format_id': join_nonempty('sec', height),
+                    # Note: \g<1> is necessary instead of \1 since bitrate is a number
+                    'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
+                    'width': int_or_none(video_quality.attrib.get('MaxWidth')),
+                    'tbr': bitrate / 1000.0,
+                    'height': height,
+                }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -335,6 +366,7 @@ def _real_extract(self, url):
 
         formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
         self._remove_duplicate_formats(formats)
+        formats.extend(self._find_secret_formats(formats, video_id))
 
         for format in formats:
             if format.get('vcodec') == 'none':
@@ -348,8 +380,6 @@ def _real_extract(self, url):
                 if 'descriptive' in format['format_id'].lower():
                     format['preference'] = -2
 
-        self._sort_formats(formats)
-
         return {
             'id': video_id,
             'title': video_info['title'],