[ie] Make `_search_nextjs_data` non-fatal (#8937)

author Simon Sawicki <redacted>
Sun, 21 Apr 2024 11:40:38 +0000 (13:40 +0200)

committer GitHub <redacted>
Sun, 21 Apr 2024 11:40:38 +0000 (13:40 +0200)

diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py

index b7dee496afb97e4e2457b14ae2474c5b60ae1a5d..c633ce3e47ab5bdf5088a5d5a58447912156b5fa 100644 (file)
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1906,6 +1906,15 @@ def test_response_with_expected_status_returns_content(self):
              expected_status=TEAPOT_RESPONSE_STATUS)
          self.assertEqual(content, TEAPOT_RESPONSE_BODY)
  
+    def test_search_nextjs_data(self):
+        data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
+        self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
+        self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
+        self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
+        self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
+        with self.assertRaises(DeprecationWarning):
+            self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py

index b088a1b1321ab5ff05d651bca539a0f086233c3b..8fa8f3edb634bd769c848fdb679563cb3acdfa9e 100644 (file)
--- a/yt_dlp/extractor/asobistage.py
+++ b/yt_dlp/extractor/asobistage.py
@@ -105,7 +105,7 @@ def _real_extract(self, url):
          video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
          webpage = self._download_webpage(url, video_id)
          event_data = traverse_obj(
-            self._search_nextjs_data(webpage, video_id, default='{}'),
+            self._search_nextjs_data(webpage, video_id, default={}),
              ('props', 'pageProps', 'eventCMSData', {
                  'title': ('event_name', {str}),
                  'thumbnail': ('event_thumbnail_image', {url_or_none}),
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 57bbf9bdf1e6efef0fb45230cbefcbe130454b25..bebbc6b43f90d1cbc85c4e7372f15b6fb40ba431 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1738,12 +1738,16 @@ def traverse_json_ld(json_ld, at_top_level=True):
          traverse_json_ld(json_ld)
          return filter_dict(info)
  
-    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
-        return self._parse_json(
-            self._search_regex(
-                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
-                webpage, 'next.js data', fatal=fatal, **kw),
-            video_id, transform_source=transform_source, fatal=fatal)
+    def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
+        if default == '{}':
+            self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
+            default = {}
+        if default is not NO_DEFAULT:
+            fatal = False
+
+        return self._search_json(
+            r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
+            video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
  
      def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
          """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py

index 8b3e63538c85c22dead18bc6c0ec5d5a72efbd52..0ab7801004739786f6db87e9c9616ccd907b9e1e 100644 (file)
--- a/yt_dlp/extractor/stv.py
+++ b/yt_dlp/extractor/stv.py
@@ -41,7 +41,7 @@ def _real_extract(self, url):
          ptype, video_id = self._match_valid_url(url).groups()
  
          webpage = self._download_webpage(url, video_id, fatal=False) or ''
-        props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {}
+        props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {}
          player_api_cache = try_get(
              props, lambda x: x['initialReduxState']['playerApiCache']) or {}
  
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py

index 3f5261ad968a7604753ee93b3e589e91b98b58c3..3d965dd4529fe6b779552acc77f6d0ab1175f007 100644 (file)
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -776,7 +776,7 @@ def _real_extract(self, url):
              status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
              video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
  
-        elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'):
+        elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
              self.write_debug('Found next.js data')
              status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
              video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))
author	Simon Sawicki <redacted>
	Sun, 21 Apr 2024 11:40:38 +0000 (13:40 +0200)
committer	GitHub <redacted>
	Sun, 21 Apr 2024 11:40:38 +0000 (13:40 +0200)
test/test_InfoExtractor.py		patch | blob | blame | history
yt_dlp/extractor/asobistage.py		patch | blob | blame | history
yt_dlp/extractor/common.py		patch | blob | blame | history
yt_dlp/extractor/stv.py		patch | blob | blame | history
yt_dlp/extractor/tiktok.py		patch | blob | blame | history