[ie/facebook] Fix webpage extraction (#7890)

author ringus1 <redacted>
Tue, 5 Sep 2023 20:35:23 +0000 (22:35 +0200)
committer GitHub <redacted>
Tue, 5 Sep 2023 20:35:23 +0000 (20:35 +0000)
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 4fd17b5743cb3112863327119cc21fcfe21b757f..c30a6b06a0135bbec3e3d99829d0f1edf56b5408 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -74,6 +74,22 @@ class FacebookIE(InfoExtractor):
      _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
  
      _TESTS = [{
+        'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
+        'info_dict': {
+            'id': '3676516585958356',
+            'ext': 'mp4',
+            'title': 'dr Adam Przygoda',
+            'description': 'md5:34675bda53336b1d16400265c2bb9b3b',
+            'uploader': 'RADIO KICKS FM',
+            'upload_date': '20230818',
+            'timestamp': 1692346159,
+            'thumbnail': r're:^https?://.*',
+            'uploader_id': '100063551323670',
+            'duration': 3132.184,
+            'view_count': int,
+            'concurrent_view_count': 0,
+        },
+    }, {
          'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
          'md5': '6a40d33c0eccbb1af76cf0485a052659',
          'info_dict': {
@@ -97,7 +113,7 @@ class FacebookIE(InfoExtractor):
              'upload_date': '20140506',
              'timestamp': 1399398998,
              'thumbnail': r're:^https?://.*',
-            'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl',
+            'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl',
              'duration': 131.03,
              'concurrent_view_count': int,
          },
@@ -179,7 +195,7 @@ class FacebookIE(InfoExtractor):
              'timestamp': 1486648217,
              'upload_date': '20170209',
              'uploader': 'Yaroslav Korpan',
-            'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl',
+            'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl',
              'concurrent_view_count': int,
              'thumbnail': r're:^https?://.*',
              'view_count': int,
@@ -274,7 +290,7 @@ class FacebookIE(InfoExtractor):
              'title': 'Josef',
              'thumbnail': r're:^https?://.*',
              'concurrent_view_count': int,
-            'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl',
+            'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl',
              'timestamp': 1549275572,
              'duration': 3.413,
              'uploader': 'Josef Novak',
@@ -401,9 +417,9 @@ def _extract_from_url(self, url, video_id):
  
          def extract_metadata(webpage):
              post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
-                r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
+                r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
              post = traverse_obj(post_data, (
-                ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
+                ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
              media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
                  k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
              title = get_first(media, ('title', 'text'))
@@ -493,14 +509,14 @@ def process_formats(info):
  
          def extract_relay_data(_filter):
              return self._parse_json(self._search_regex(
-                r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
+                r'data-sjs>({.*?%s.*?})</script>' % _filter,
                  webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
  
          def extract_relay_prefetched_data(_filter):
-            replay_data = extract_relay_data(_filter)
-            for require in (replay_data.get('require') or []):
-                if require[0] == 'RelayPrefetchedStreamCache':
-                    return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+            return traverse_obj(extract_relay_data(_filter), (
+                'require', (None, (..., ..., ..., '__bbox', 'require')),
+                lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
+                '__bbox', 'result', 'data', {dict}), get_all=False) or {}
  
          if not video_data:
              server_js_data = self._parse_json(self._search_regex([
@@ -511,7 +527,7 @@ def extract_relay_prefetched_data(_filter):
  
          if not video_data:
              data = extract_relay_prefetched_data(
-                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
+                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
              if data:
                  entries = []
  
@@ -526,7 +542,8 @@ def parse_graphql_video(video):
                      formats = []
                      q = qualities(['sd', 'hd'])
                      for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
-                                           ('playable_url_dash', '')):
+                                           ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
+                                           ('browser_native_sd_url', 'sd')):
                          playable_url = video.get(key)
                          if not playable_url:
                              continue
author	ringus1 <redacted>
	Tue, 5 Sep 2023 20:35:23 +0000 (22:35 +0200)
committer	GitHub <redacted>
	Tue, 5 Sep 2023 20:35:23 +0000 (20:35 +0000)