[extractor/youtube] Fix initial player response extraction

author coletdjnz <redacted>

Sun, 29 May 2022 07:54:22 +0000 (19:54 +1200)

committer coletdjnz <redacted>

Sun, 29 May 2022 07:54:22 +0000 (19:54 +1200)
author coletdjnz <redacted>
Sun, 29 May 2022 07:54:22 +0000 (19:54 +1200)
committer coletdjnz <redacted>
Sun, 29 May 2022 07:54:22 +0000 (19:54 +1200)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index b24599d5ffb9156ecfb00b5c2b33ca5a8a94454b..5767662ed5ee065ed5e37a6fc33ebefc7bf986dd 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1033,11 +1033,19 @@ def _download_json(
              expected_status=expected_status)
          return res if res is False else res[0]
  
-    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
+    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
          if transform_source:
              json_string = transform_source(json_string)
          try:
-            return json.loads(json_string, strict=False)
+            try:
+                return json.loads(json_string, strict=False)
+            except json.JSONDecodeError as e:
+                if not lenient:
+                    raise
+                try:
+                    return json.loads(json_string[:e.pos], strict=False)
+                except ValueError:
+                    raise e
          except ValueError as ve:
              errmsg = '%s: Failed to parse JSON ' % video_id
              if fatal:
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 245778dff2d8be48556e671cdc2e54c4072e87f1..6d9659b1d77d9f4fe7f3b0e20ed7f797dc12c546 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -397,8 +397,8 @@ def _check_login_required(self):
          if self._LOGIN_REQUIRED and not self._cookies_passed:
              self.raise_login_required('Login details are needed to download this content', method='cookies')
  
-    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
-    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
+    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+})\s*;'
+    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+})\s*;'
      _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
  
      def _get_default_ytcfg(self, client='web'):
@@ -2212,28 +2212,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          }, {
              # Story. Requires specific player params to work.
              # Note: stories get removed after some period of time
-            'url': 'https://www.youtube.com/watch?v=yN3x1t3sieA',
+            'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI',
              'info_dict': {
-                'id': 'yN3x1t3sieA',
+                'id': 'vv8qTUWmulI',
                  'ext': 'mp4',
-                'uploader': 'Linus Tech Tips',
-                'duration': 13,
-                'channel': 'Linus Tech Tips',
+                'availability': 'unlisted',
+                'view_count': int,
+                'channel_id': 'UCzIZ8HrzDgc-pNQDUG6avBA',
+                'upload_date': '20220526',
+                'categories': ['Education'],
+                'title': 'Story',
+                'channel': 'IT\'S HISTORY',
+                'description': '',
+                'uploader_id': 'BlastfromthePast',
+                'duration': 12,
+                'uploader': 'IT\'S HISTORY',
                  'playable_in_embed': True,
-                'tags': [],
                  'age_limit': 0,
-                'uploader_url': 'http://www.youtube.com/user/LinusTechTips',
-                'upload_date': '20220402',
-                'thumbnail': 'https://i.ytimg.com/vi_webp/yN3x1t3sieA/maxresdefault.webp',
-                'title': 'Story',
                  'live_status': 'not_live',
-                'uploader_id': 'LinusTechTips',
+                'tags': [],
+                'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp',
+                'uploader_url': 'http://www.youtube.com/user/BlastfromthePast',
+                'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA',
+            }
+        }, {
+            'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA',
+            'info_dict': {
+                'id': 'tjjjtzRLHvA',
+                'ext': 'mp4',
+                'title': 'ハッシュタグ無し };if window.ytcsi',
+                'upload_date': '20220323',
+                'like_count': int,
+                'availability': 'unlisted',
+                'channel': 'nao20010128nao',
+                'thumbnail': 'https://i.ytimg.com/vi_webp/tjjjtzRLHvA/maxresdefault.webp',
+                'age_limit': 0,
+                'uploader': 'nao20010128nao',
+                'uploader_id': 'nao20010128nao',
+                'categories': ['Music'],
                  'view_count': int,
                  'description': '',
-                'channel_id': 'UCXuqSBlHAE6Xw-yeJA0Tunw',
-                'categories': ['Science & Technology'],
-                'channel_url': 'https://www.youtube.com/channel/UCXuqSBlHAE6Xw-yeJA0Tunw',
-                'availability': 'unlisted',
+                'channel_url': 'https://www.youtube.com/channel/UCdqltm_7iv1Vs6kp6Syke5A',
+                'channel_id': 'UCdqltm_7iv1Vs6kp6Syke5A',
+                'live_status': 'not_live',
+                'playable_in_embed': True,
+                'channel_follower_count': int,
+                'duration': 6,
+                'tags': [],
+                'uploader_url': 'http://www.youtube.com/user/nao20010128nao',
              }
          }
      ]
@@ -2754,7 +2780,7 @@ def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration)
      def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
          return self._parse_json(self._search_regex(
              (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}',
-             regex), webpage, name, default='{}'), video_id, fatal=False)
+             regex), webpage, name, default='{}'), video_id, fatal=False, lenient=True)
  
      def _extract_comment(self, comment_renderer, parent=None):
          comment_id = comment_renderer.get('commentId')
author	coletdjnz <redacted>
	Sun, 29 May 2022 07:54:22 +0000 (19:54 +1200)
committer	coletdjnz <redacted>
	Sun, 29 May 2022 07:54:22 +0000 (19:54 +1200)
yt_dlp/extractor/common.py		patch | blob | blame | history
yt_dlp/extractor/youtube.py		patch | blob | blame | history