[ie/crunchyroll] Fix stream extraction (#10005)

[yt-dlp.git] / yt_dlp / extractor / rokfin.py
diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py

index ad53d697e75e960512523865e418be860c2e1fc3..3bc5f3cab293483e602b3b4b86f13d86f2db074d 100644 (file)
--- a/yt_dlp/extractor/rokfin.py
+++ b/yt_dlp/extractor/rokfin.py
@@ -1,8 +1,8 @@
+import datetime as dt
  import itertools
  import json
  import re
  import urllib.parse
-from datetime import datetime
  
  from .common import InfoExtractor, SearchInfoExtractor
  from ..utils import (
@@ -38,13 +38,13 @@ class RokfinIE(InfoExtractor):
              'upload_date': '20211023',
              'timestamp': 1634998029,
              'channel': 'Jimmy Dore',
-            'channel_id': 65429,
+            'channel_id': '65429',
              'channel_url': 'https://rokfin.com/TheJimmyDoreShow',
-            'duration': 213.0,
              'availability': 'public',
              'live_status': 'not_live',
              'dislike_count': int,
              'like_count': int,
+            'duration': 213,
          }
      }, {
          'url': 'https://rokfin.com/post/223/Julian-Assange-Arrested-Streaming-In-Real-Time',
@@ -56,7 +56,7 @@ class RokfinIE(InfoExtractor):
              'upload_date': '20190412',
              'timestamp': 1555052644,
              'channel': 'Ron Placone',
-            'channel_id': 10,
+            'channel_id': '10',
              'channel_url': 'https://rokfin.com/RonPlacone',
              'availability': 'public',
              'live_status': 'not_live',
@@ -72,8 +72,8 @@ class RokfinIE(InfoExtractor):
              'title': '"It\'s A Crazy Mess" Regional Director Blows Whistle On Pfizer\'s Vaccine Trial Data',
              'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
              'description': 'md5:324ce2d3e3b62e659506409e458b9d8e',
-            'channel': 'Ryan Cristián',
-            'channel_id': 53856,
+            'channel': 'TLAVagabond',
+            'channel_id': '53856',
              'channel_url': 'https://rokfin.com/TLAVagabond',
              'availability': 'public',
              'is_live': False,
@@ -87,6 +87,46 @@ class RokfinIE(InfoExtractor):
              'like_count': int,
              'tags': ['FreeThinkingMedia^'],
          }
+    }, {
+        'url': 'https://rokfin.com/post/126703/Brave-New-World--Aldous-Huxley-DEEPDIVE--Chpts-13--Quite-Frankly--Jay-Dyer',
+        'info_dict': {
+            'id': 'post/126703',
+            'ext': 'mp4',
+            'title': 'Brave New World - Aldous Huxley DEEPDIVE!  (Chpts 1-3) - Quite Frankly & Jay Dyer',
+            'thumbnail': r're:https://img\.production\.rokfin\.com/.+',
+            'channel': 'Jay Dyer',
+            'channel_id': '186881',
+            'channel_url': 'https://rokfin.com/jaydyer',
+            'availability': 'premium_only',
+            'live_status': 'not_live',
+            'dislike_count': int,
+            'like_count': int,
+            'timestamp': 1678213357,
+            'upload_date': '20230307',
+            'tags': ['FreeThinkingMedia^', 'OpenMind^'],
+            'description': 'md5:cb04e32e68326c9b2b251b297bacff35',
+            'duration': 3100,
+        }
+    }, {
+        'url': 'https://rokfin.com/stream/31332/The-Grayzone-live-on-Nordstream-blame-game',
+        'info_dict': {
+            'id': 'stream/31332',
+            'ext': 'mp4',
+            'title': 'The Grayzone live on Nordstream blame game',
+            'thumbnail': r're:https://image\.v\.rokfin\.com/.+',
+            'channel': 'Max Blumenthal',
+            'channel_id': '248902',
+            'channel_url': 'https://rokfin.com/MaxBlumenthal',
+            'availability': 'premium_only',
+            'live_status': 'was_live',
+            'dislike_count': int,
+            'like_count': int,
+            'timestamp': 1678475166,
+            'release_timestamp': 1678475166.0,
+            'release_date': '20230310',
+            'upload_date': '20230310',
+            'tags': ['FreeThinkingMedia^'],
+        }
      }]
  
      def _real_extract(self, url):
@@ -100,6 +140,12 @@ def _real_extract(self, url):
                         else 'not_live')
  
          video_url = traverse_obj(metadata, 'url', ('content', 'contentUrl'), expected_type=url_or_none)
+        if video_url in (None, 'fake.m3u8'):
+            video_url = format_field(self._search_regex(
+                r'https?://[^/]+/([^/]+)/storyboard.vtt',
+                traverse_obj(metadata, 'timelineUrl', ('content', 'timelineUrl'), expected_type=url_or_none),
+                video_id, default=None), None, 'https://stream.v.rokfin.com/%s.m3u8')
+
          formats, subtitles = [{'url': video_url}] if video_url else [], {}
          if determine_ext(video_url) == 'm3u8':
              formats, subtitles = self._extract_m3u8_formats_and_subtitles(
@@ -110,9 +156,8 @@ def _real_extract(self, url):
                  self.raise_login_required('This video is only available to premium users', True, method='cookies')
              elif scheduled:
                  self.raise_no_formats(
-                    f'Stream is offline; sheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
+                    f'Stream is offline; scheduled for {dt.datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
                      video_id=video_id, expected=True)
-        self._sort_formats(formats)
  
          uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username'))
          timestamp = (scheduled or float_or_none(metadata.get('postedAtMilli'), 1000)
@@ -128,7 +173,7 @@ def _real_extract(self, url):
              'like_count': int_or_none(metadata.get('likeCount')),
              'dislike_count': int_or_none(metadata.get('dislikeCount')),
              'channel': str_or_none(traverse_obj(metadata, ('createdBy', 'name'), ('creator', 'name'))),
-            'channel_id': traverse_obj(metadata, ('createdBy', 'id'), ('creator', 'id')),
+            'channel_id': str_or_none(traverse_obj(metadata, ('createdBy', 'id'), ('creator', 'id'))),
              'channel_url': url_or_none(f'https://rokfin.com/{uploader}') if uploader else None,
              'timestamp': timestamp,
              'release_timestamp': timestamp if live_status != 'not_live' else None,
@@ -146,7 +191,7 @@ def _get_comments(self, video_id):
          for page_n in itertools.count():
              raw_comments = self._download_json(
                  f'{_API_BASE_URL}comment?postId={video_id[5:]}&page={page_n}&size=50',
-                video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, template=" of %s")}',
+                video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, None, " of %s")}',
                  fatal=False) or {}
  
              for comment in raw_comments.get('content') or []:
@@ -198,7 +243,7 @@ def _perform_login(self, username, password):
              f'{self._AUTH_BASE}/token', None,
              note='getting access credentials', errnote='error getting access credentials',
              data=urlencode_postdata({
-                'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.geturl()).fragment).get('code')[0],
+                'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.url).fragment).get('code')[0],
                  'client_id': 'web',
                  'grant_type': 'authorization_code',
                  'redirect_uri': 'https://rokfin.com/silent-check-sso.html'
@@ -222,7 +267,7 @@ def _download_json_using_access_token(self, url_or_request, video_id, headers={}
  
          json_string, urlh = self._download_webpage_handle(
              url_or_request, video_id, headers=headers, query=query, expected_status=401)
-        if not auth_token or urlh.code != 401 or refresh_token is None:
+        if not auth_token or urlh.status != 401 or refresh_token is None:
              return self._parse_json(json_string, video_id)
  
          self._access_mgmt_tokens = self._download_json(
@@ -318,7 +363,7 @@ def _entries(self, channel_id, channel_name, tab):
                  data_url = f'{_API_BASE_URL}post/search/{tab}?page={page_n}&size=50&creator={channel_id}'
              metadata = self._download_json(
                  data_url, channel_name,
-                note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, template=" of %s")}')
+                note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, None, " of %s")}')
  
              yield from self._get_video_data(metadata)
              pages_total = int_or_none(metadata.get('totalPages')) or None
@@ -360,7 +405,7 @@ class RokfinSearchIE(SearchInfoExtractor):
      _db_access_key = None
  
      def _real_initialize(self):
-        self._db_url, self._db_access_key = self._downloader.cache.load(self.ie_key(), 'auth', default=(None, None))
+        self._db_url, self._db_access_key = self.cache.load(self.ie_key(), 'auth', default=(None, None))
          if not self._db_url:
              self._get_db_access_credentials()
  
@@ -369,7 +414,7 @@ def _search_results(self, query):
          for page_number in itertools.count(1):
              search_results = self._run_search_query(
                  query, data={'query': query, 'page': {'size': 100, 'current': page_number}},
-                note=f'Downloading page {page_number}{format_field(total_pages, template=" of ~%s")}')
+                note=f'Downloading page {page_number}{format_field(total_pages, None, " of ~%s")}')
              total_pages = traverse_obj(search_results, ('meta', 'page', 'total_pages'), expected_type=int_or_none)
  
              for result in search_results.get('results') or []:
@@ -405,6 +450,6 @@ def _get_db_access_credentials(self, video_id=None):
  
              self._db_url = url_or_none(f'{auth_data["ENDPOINT_BASE"]}/api/as/v1/engines/rokfin-search/search.json')
              self._db_access_key = f'Bearer {auth_data["SEARCH_KEY"]}'
-            self._downloader.cache.store(self.ie_key(), 'auth', (self._db_url, self._db_access_key))
+            self.cache.store(self.ie_key(), 'auth', (self._db_url, self._db_access_key))
              return
          raise ExtractorError('Unable to extract access credentials')