diff --git a/yt_dlp/extractor/washingtonpost.py b/yt_dlp/extractor/washingtonpost.py
index 8afb1af831aec44831d07d61adae269d796f10c9..36e1f30e5dd9110046c21e1b5f0d0d945ac18fc2 100644
--- a/yt_dlp/extractor/washingtonpost.py
+++ b/yt_dlp/extractor/washingtonpost.py
@@ -1,15 +1,13 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
+from ..utils import traverse_obj
 
 
 class WashingtonPostIE(InfoExtractor):
     IE_NAME = 'washingtonpost'
     _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
-    _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})']
     _TESTS = [{
         'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
         'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -29,11 +27,6 @@ class WashingtonPostIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @classmethod
-    def _extract_urls(cls, webpage):
-        return re.findall(
-            r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
         return self.url_result(
@@ -50,7 +43,7 @@ class WashingtonPostArticleIE(InfoExtractor):
             'title': 'Sinkhole of bureaucracy',
         },
         'playlist': [{
-            'md5': 'b9be794ceb56c7267d410a13f99d801a',
+            'md5': '7ccf53ea8cbb77de5f570242b3b21a59',
             'info_dict': {
                 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
                 'ext': 'mp4',
@@ -59,9 +52,10 @@ class WashingtonPostArticleIE(InfoExtractor):
                 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
                 'timestamp': 1395440416,
                 'upload_date': '20140321',
+                'thumbnail': r're:https://[^\.]+.cloudfront\.net/PAPERMINESplash\.jpg',
             },
         }, {
-            'md5': '1fff6a689d8770966df78c8cb6c8c17c',
+            'md5': '7ccf53ea8cbb77de5f570242b3b21a59',
             'info_dict': {
                 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
                 'ext': 'mp4',
@@ -70,6 +64,7 @@ class WashingtonPostArticleIE(InfoExtractor):
                 'duration': 2220,
                 'timestamp': 1395441819,
                 'upload_date': '20140321',
+                'thumbnail': r're:https://[^\.]+.cloudfront\.net/BoyersSplash\.jpeg',
             },
         }],
     }, {
@@ -87,13 +82,17 @@ class WashingtonPostArticleIE(InfoExtractor):
                 'upload_date': '20141230',
                 'timestamp': 1419972442,
                 'title': 'Why black boxes don’t transmit data in real time',
-            }
-        }]
+            },
+        }],
+        'skip': "Doesn't have a video anymore",
+    }, {
+        'url': 'https://www.washingtonpost.com/nation/2021/08/05/dixie-river-fire-california-climate/',
+        'only_matching': True,
     }]
 
     @classmethod
     def suitable(cls, url):
-        return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url)
+        return False if WashingtonPostIE.suitable(url) else super().suitable(url)
 
     def _real_extract(self, url):
         page_id = self._match_id(url)
@@ -106,7 +105,14 @@ def _real_extract(self, url):
                 <div\s+class="posttv-video-embed[^>]*?data-uuid=|
                 data-video-uuid=
             )"([^"]+)"''', webpage)
-        entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]
+
+        if not uuids:
+            json_data = self._search_nextjs_data(webpage, page_id)
+            for content_element in traverse_obj(json_data, ('props', 'pageProps', 'globalContent', 'content_elements')):
+                if content_element.get('type') == 'video':
+                    uuids.append(content_element.get('_id'))
+
+        entries = [self.url_result(f'washingtonpost:{uuid}', 'WashingtonPost', uuid) for uuid in uuids]
 
         return {
             '_type': 'playlist',
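
Note on the first two hunks: the per-extractor _extract_urls() classmethod is dropped
in favour of declaring _EMBED_REGEX, whose (?P<url>...) named group is what yt-dlp's
generic embed detection extracts when it scans an embedding page. A quick standalone
check of the new pattern (only the regex is taken from the diff; the toy page reuses
the UUID from the tests above):

# Standalone check of the new _EMBED_REGEX pattern from the first hunk.
import re

EMBED_REGEX = (
    r'<iframe[^>]+\bsrc=["\'](?P<url>https?://(?:www\.)?washingtonpost\.com'
    r'/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})')

# Toy embedding page built around the UUID used in the tests above.
page = ('<iframe src="https://www.washingtonpost.com/video/c/embed/'
        '480ba4ee-1ec7-11e6-82c2-a7dcb313287d"></iframe>')
print([m.group('url') for m in re.finditer(EMBED_REGEX, page)])
# ['https://www.washingtonpost.com/video/c/embed/480ba4ee-1ec7-11e6-82c2-a7dcb313287d']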
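
Note on the last hunk: when no data-uuid / data-video-uuid attributes are found in the
page, the extractor now falls back to the Next.js __NEXT_DATA__ payload, reading it via
_search_nextjs_data() and walking props.pageProps.globalContent.content_elements with
traverse_obj() to collect the _id of every element whose type is 'video'. A minimal
standalone sketch of that fallback, assuming only the standard library (the helper name
and sample payload are illustrative; the traversal path is the one from the diff):

import json
import re

def extract_video_uuids(webpage):
    # Roughly what _search_nextjs_data() does: pull the JSON out of the
    # <script id="__NEXT_DATA__"> tag that Next.js pages embed.
    mobj = re.search(
        r'<script[^>]+id=["\']__NEXT_DATA__["\'][^>]*>(.+?)</script>',
        webpage, re.DOTALL)
    if not mobj:
        return []
    data = json.loads(mobj.group(1))
    # Same path the diff walks with traverse_obj(); plain dict chaining
    # keeps the sketch dependency-free.
    elements = (data.get('props', {}).get('pageProps', {})
                .get('globalContent', {}).get('content_elements') or [])
    return [el.get('_id') for el in elements if el.get('type') == 'video']

# Toy payload for illustration; the UUID is taken from the tests above.
sample = '''<script id="__NEXT_DATA__" type="application/json">
{"props": {"pageProps": {"globalContent": {"content_elements": [
    {"type": "text", "_id": "not-a-video"},
    {"type": "video", "_id": "480ba4ee-1ec7-11e6-82c2-a7dcb313287d"}
]}}}}</script>'''
print(extract_video_uuids(sample))
# ['480ba4ee-1ec7-11e6-82c2-a7dcb313287d']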