jfr.im git - yt-dlp.git/commitdiff
[ie] Make `_search_nextjs_data` non fatal (#8937)
author: Simon Sawicki <redacted>
Sun, 21 Apr 2024 11:40:38 +0000 (13:40 +0200)
committer: GitHub <redacted>
Sun, 21 Apr 2024 11:40:38 +0000 (13:40 +0200)
Authored by: Grub4K

test/test_InfoExtractor.py
yt_dlp/extractor/asobistage.py
yt_dlp/extractor/common.py
yt_dlp/extractor/stv.py
yt_dlp/extractor/tiktok.py

index b7dee496afb97e4e2457b14ae2474c5b60ae1a5d..c633ce3e47ab5bdf5088a5d5a58447912156b5fa 100644 (file)
@@ -1906,6 +1906,15 @@ def test_response_with_expected_status_returns_content(self):
             expected_status=TEAPOT_RESPONSE_STATUS)
         self.assertEqual(content, TEAPOT_RESPONSE_BODY)
 
+    def test_search_nextjs_data(self):
+        data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
+        self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
+        self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
+        self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
+        self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
+        with self.assertRaises(DeprecationWarning):
+            self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
+
 
 if __name__ == '__main__':
     unittest.main()
index b088a1b1321ab5ff05d651bca539a0f086233c3b..8fa8f3edb634bd769c848fdb679563cb3acdfa9e 100644 (file)
@@ -105,7 +105,7 @@ def _real_extract(self, url):
         video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
         webpage = self._download_webpage(url, video_id)
         event_data = traverse_obj(
-            self._search_nextjs_data(webpage, video_id, default='{}'),
+            self._search_nextjs_data(webpage, video_id, default={}),
             ('props', 'pageProps', 'eventCMSData', {
                 'title': ('event_name', {str}),
                 'thumbnail': ('event_thumbnail_image', {url_or_none}),
index 57bbf9bdf1e6efef0fb45230cbefcbe130454b25..bebbc6b43f90d1cbc85c4e7372f15b6fb40ba431 100644 (file)
@@ -1738,12 +1738,16 @@ def traverse_json_ld(json_ld, at_top_level=True):
         traverse_json_ld(json_ld)
         return filter_dict(info)
 
-    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
-        return self._parse_json(
-            self._search_regex(
-                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
-                webpage, 'next.js data', fatal=fatal, **kw),
-            video_id, transform_source=transform_source, fatal=fatal)
+    def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
+        if default == '{}':
+            self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
+            default = {}
+        if default is not NO_DEFAULT:
+            fatal = False
+
+        return self._search_json(
+            r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
+            video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
 
     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
index 8b3e63538c85c22dead18bc6c0ec5d5a72efbd52..0ab7801004739786f6db87e9c9616ccd907b9e1e 100644 (file)
@@ -41,7 +41,7 @@ def _real_extract(self, url):
         ptype, video_id = self._match_valid_url(url).groups()
 
         webpage = self._download_webpage(url, video_id, fatal=False) or ''
-        props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {}
+        props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {}
         player_api_cache = try_get(
             props, lambda x: x['initialReduxState']['playerApiCache']) or {}
 
index 3f5261ad968a7604753ee93b3e589e91b98b58c3..3d965dd4529fe6b779552acc77f6d0ab1175f007 100644 (file)
@@ -776,7 +776,7 @@ def _real_extract(self, url):
             status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
             video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
 
-        elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'):
+        elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
             self.write_debug('Found next.js data')
             status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
             video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))