[ie/Facebook] Fix Memories extraction (#8681)

[yt-dlp.git] / yt_dlp / extractor / facebook.py
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py

index 50a750d3b127092e26d883ab4146c1bd77dbeced..a07a0d344d58a7118e2b7a2da06341852b085d08 100644 (file)
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -16,6 +16,7 @@
      determine_ext,
      error_to_compat_str,
      float_or_none,
+    format_field,
      get_element_by_id,
      get_first,
      int_or_none,
@@ -51,7 +52,7 @@ class FacebookIE(InfoExtractor):
                              )\?(?:.*?)(?:v|video_id|story_fbid)=|
                              [^/]+/videos/(?:[^/]+/)?|
                              [^/]+/posts/|
-                            groups/[^/]+/permalink/|
+                            groups/[^/]+/(?:permalink|posts)/|
                              watchparty/
                          )|
                      facebook:
@@ -231,6 +232,21 @@ class FacebookIE(InfoExtractor):
              'uploader_id': '100013949973717',
          },
          'skip': 'Requires logging in',
+    }, {
+        # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
+        'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
+        'info_dict': {
+            'id': '1569199726448814',
+            'ext': 'mp4',
+            'title': 'Pence MUST GO!',
+            'description': 'Vickie Gentry shared a memory.',
+            'timestamp': 1511548260,
+            'upload_date': '20171124',
+            'uploader': 'Vickie Gentry',
+            'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
+            'thumbnail': r're:^https?://.*',
+            'duration': 148.435,
+        },
      }, {
          'url': 'https://www.facebook.com/video.php?v=10204634152394104',
          'only_matching': True,
@@ -420,6 +436,29 @@ def extract_metadata(webpage):
                  r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
              post = traverse_obj(post_data, (
                  ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
+
+            automatic_captions, subtitles = {}, {}
+            subs_data = traverse_obj(post, (..., 'video', ..., 'attachments', ..., lambda k, v: (
+                k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')))
+            is_video_broadcast = get_first(subs_data, 'is_video_broadcast', expected_type=bool)
+            captions = get_first(subs_data, 'video_available_captions_locales', 'captions_url')
+            if url_or_none(captions):  # if subs_data only had a 'captions_url'
+                locale = self._html_search_meta(['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
+                subtitles[locale] = [{'url': captions}]
+            # or else subs_data had 'video_available_captions_locales', a list of dicts
+            for caption in traverse_obj(captions, (
+                {lambda x: sorted(x, key=lambda c: c['locale'])}, lambda _, v: v['captions_url'])
+            ):
+                lang = caption.get('localized_language') or ''
+                subs = {
+                    'url': caption['captions_url'],
+                    'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
+                }
+                if caption.get('localized_creation_method') or is_video_broadcast:
+                    automatic_captions.setdefault(caption['locale'], []).append(subs)
+                else:
+                    subtitles.setdefault(caption['locale'], []).append(subs)
+
              media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
                  k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
              title = get_first(media, ('title', 'text'))
@@ -463,6 +502,8 @@ def extract_metadata(webpage):
                      webpage, 'view count', default=None)),
                  'concurrent_view_count': get_first(post, (
                      ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
+                'automatic_captions': automatic_captions,
+                'subtitles': subtitles,
              }
  
              info_json_ld = self._search_json_ld(webpage, video_id, default={})
@@ -586,9 +627,11 @@ def parse_attachment(attachment, key='media'):
                  nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
                  attachments = traverse_obj(nodes, (
                      ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
-                    ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or []
+                    ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
+                    'attachment', {dict}))
                  for attachment in attachments:
-                    ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+                    ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
+                                      ('target', 'attachments', ..., 'styles', 'attachment', {dict}))
                      for n in ns:
                          parse_attachment(n)
                      parse_attachment(attachment)
@@ -611,7 +654,7 @@ def parse_attachment(attachment, key='media'):
                  if len(entries) > 1:
                      return self.playlist_result(entries, video_id)
  
-                video_info = entries[0]
+                video_info = entries[0] if entries else {'id': video_id}
                  webpage_info = extract_metadata(webpage)
                  # honor precise duration in video info
                  if video_info.get('duration'):