jfr.im git - yt-dlp.git/commitdiff
[youtube_live_chat] Fix `parse_yt_initial_data` and add `fragment_retries`
author: pukkandan <redacted>
Sun, 31 Jan 2021 07:48:06 +0000 (13:18 +0530)
committer: pukkandan <redacted>
Sun, 31 Jan 2021 15:22:43 +0000 (20:52 +0530)
:ci skip dl

youtube_dlc/downloader/youtube_live_chat.py
youtube_dlc/extractor/youtube.py

index 223b4b81c1fd014c3cbf76c90b91ed463a9077cb..f162aff9c9128584da3dd83e867932bb63d440c0 100644 (file)
@@ -4,6 +4,9 @@
 import json
 
 from .fragment import FragmentFD
+from ..compat import compat_urllib_error
+from ..utils import try_get
+from ..extractor.youtube import YoutubeBaseInfoExtractor as YT_BaseIE
 
 
 class YoutubeLiveChatReplayFD(FragmentFD):
@@ -15,6 +18,7 @@ def real_download(self, filename, info_dict):
         video_id = info_dict['video_id']
         self.to_screen('[%s] Downloading live chat' % self.FD_NAME)
 
+        fragment_retries = self.params.get('fragment_retries', 0)
         test = self.params.get('test', False)
 
         ctx = {
@@ -28,15 +32,52 @@ def dl_fragment(url):
             return self._download_fragment(ctx, url, info_dict, headers)
 
         def parse_yt_initial_data(data):
-            window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?)(?<=});'
-            var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?)(?<=});'
-            for patt in window_patt, var_patt:
+            patterns = (
+                r'%s\\s*%s' % (YT_BaseIE._YT_INITIAL_DATA_RE, YT_BaseIE._YT_INITIAL_BOUNDARY_RE),
+                r'%s' % YT_BaseIE._YT_INITIAL_DATA_RE)
+            data = data.decode('utf-8', 'replace')
+            for patt in patterns:
                 try:
                     raw_json = re.search(patt, data).group(1)
                     return json.loads(raw_json)
                 except AttributeError:
                     continue
 
+        def download_and_parse_fragment(url, frag_index):
+            count = 0
+            while count <= fragment_retries:
+                try:
+                    success, raw_fragment = dl_fragment(url)
+                    if not success:
+                        return False, None, None
+                    data = parse_yt_initial_data(raw_fragment) or json.loads(raw_fragment)['response']
+
+                    live_chat_continuation = try_get(
+                        data,
+                        lambda x: x['continuationContents']['liveChatContinuation'], dict) or {}
+                    offset = continuation_id = None
+                    processed_fragment = bytearray()
+                    for action in live_chat_continuation.get('actions', []):
+                        if 'replayChatItemAction' in action:
+                            replay_chat_item_action = action['replayChatItemAction']
+                            offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
+                        processed_fragment.extend(
+                            json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
+                    if offset is not None:
+                        continuation_id = try_get(
+                            live_chat_continuation,
+                            lambda x: x['continuations'][0]['liveChatReplayContinuationData']['continuation'])
+                    self._append_fragment(ctx, processed_fragment)
+
+                    return True, continuation_id, offset
+                except compat_urllib_error.HTTPError as err:
+                    count += 1
+                    if count <= fragment_retries:
+                        self.report_retry_fragment(err, frag_index, count, fragment_retries)
+            if count > fragment_retries:
+                self.report_error('giving up after %s fragment retries' % fragment_retries)
+                return False, None, None
+
         self._prepare_and_start_frag_download(ctx)
 
         success, raw_fragment = dl_fragment(
@@ -44,54 +85,23 @@ def parse_yt_initial_data(data):
         if not success:
             return False
         data = parse_yt_initial_data(raw_fragment)
-        continuation_id = data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
+        continuation_id = try_get(
+            data,
+            lambda x: x['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'])
         # no data yet but required to call _append_fragment
         self._append_fragment(ctx, b'')
 
-        first = True
-        offset = None
+        frag_index = offset = 0
         while continuation_id is not None:
-            data = None
-            if first:
-                url = 'https://www.youtube.com/live_chat_replay?continuation={}'.format(continuation_id)
-                success, raw_fragment = dl_fragment(url)
-                if not success:
-                    return False
-                data = parse_yt_initial_data(raw_fragment)
-            else:
-                url = ('https://www.youtube.com/live_chat_replay/get_live_chat_replay'
-                       + '?continuation={}'.format(continuation_id)
-                       + '&playerOffsetMs={}'.format(max(offset - 5000, 0))
-                       + '&hidden=false'
-                       + '&pbj=1')
-                success, raw_fragment = dl_fragment(url)
-                if not success:
-                    return False
-                data = json.loads(raw_fragment)['response']
-
-            first = False
-            continuation_id = None
-
-            live_chat_continuation = data['continuationContents']['liveChatContinuation']
-            offset = None
-            processed_fragment = bytearray()
-            if 'actions' in live_chat_continuation:
-                for action in live_chat_continuation['actions']:
-                    if 'replayChatItemAction' in action:
-                        replay_chat_item_action = action['replayChatItemAction']
-                        offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
-                    processed_fragment.extend(
-                        json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
-                try:
-                    continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
-                except KeyError:
-                    continuation_id = None
-
-            self._append_fragment(ctx, processed_fragment)
-
-            if test or offset is None:
+            frag_index += 1
+            url = 'https://www.youtube.com/live_chat_replay?continuation=%s' % continuation_id
+            if frag_index > 1:
+                url += '&playerOffsetMs=%d&hidden=false&pbj=1' % max(offset - 5000, 0)
+            success, continuation_id, offset = download_and_parse_fragment(url, frag_index)
+            if not success:
+                return False
+            if test:
                 break
 
         self._finish_frag_download(ctx)
-
         return True
index 7c32d32000e9b2644a9cb891a03059b138f6b6af..0ba6a299e0ccacded163d7c62ed52f38e9bbb3c1 100644 (file)
@@ -277,15 +277,6 @@ def _download_webpage_handle(self, *args, **kwargs):
         return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
             *args, **compat_kwargs(kwargs))
 
-    def _get_yt_initial_data(self, video_id, webpage):
-        config = self._search_regex(
-            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
-             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
-            webpage, 'ytInitialData', default=None)
-        if config:
-            return self._parse_json(
-                uppercase_escape(config), video_id, fatal=False)
-
     def _real_initialize(self):
         if self._downloader is None:
             return
@@ -1943,7 +1934,7 @@ def feed_entry(name):
 
         has_live_chat_replay = False
         if not is_live:
-            yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
+            yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
             try:
                 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
                 has_live_chat_replay = True