jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/mlb.py
[ie/youtube] Extract upload timestamp if available (#9856)
[yt-dlp.git] / yt_dlp / extractor / mlb.py
index ab0edbae390fc7c8927bba57851406e7ad2b9791..d715b978920bb3c67bea8ab65f202da4f445e569 100644 (file)
@@ -54,7 +54,6 @@ def _real_extract(self, url):
                         'width': int(mobj.group(1)),
                     })
                 formats.append(f)
-        self._sort_formats(formats)
 
         thumbnails = []
         for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []):
@@ -339,11 +338,42 @@ def _real_extract(self, url):
             formats.extend(f)
             self._merge_subtitles(s, target=subtitles)
 
-        self._sort_formats(formats)
         return {
             'id': video_id,
             'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False),
+            'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE',
             'formats': formats,
             'subtitles': subtitles,
             'http_headers': {'Authorization': f'Bearer {self._access_token}'},
         }
+
+
+class MLBArticleIE(InfoExtractor):
+    # Extracts every video embedded in an mlb.com news article and returns
+    # them as a playlist of MLBVideoIE URLs.
+    _VALID_URL = r'https?://www\.mlb\.com/news/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.mlb.com/news/manny-machado-robs-guillermo-heredia-reacts',
+        'info_dict': {
+            'id': '36db7394-343c-4ea3-b8ca-ead2e61bca9a',
+            'title': 'Machado\'s grab draws hilarious irate reaction',
+            'modified_timestamp': 1675888370,
+            'description': 'md5:a19d4eb0487b2cb304e9a176f6b67676',
+            'modified_date': '20230208',
+        },
+        'playlist_mincount': 2,
+    }]
+
+    def _real_extract(self, url):
+        # The article slug from the URL is only a display id; the playlist id
+        # proper is the article's 'translationId' read from the page state.
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        apollo_cache_json = self._search_json(r'window\.initState\s*=', webpage, 'window.initState', display_id)['apolloCache']
+
+        # The article record sits under a ROOT_QUERY key that starts with
+        # 'getArticle'. traverse_obj yields None when no such key exists, so
+        # fall back to an empty dict to keep the .get() lookups below from
+        # raising AttributeError on None.
+        content_real_info = traverse_obj(
+            apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getArticle')), get_all=False) or {}
+        if not content_real_info:
+            self.report_warning('Unable to find article data', display_id)
+
+        # Keep only the article parts that are videos; an article without any
+        # yields an empty playlist instead of passing None on as the matches.
+        video_parts = traverse_obj(
+            content_real_info, ('parts', lambda _, v: v['__typename'] == 'Video' or v['type'] == 'video')) or []
+
+        return self.playlist_from_matches(
+            video_parts,
+            getter=lambda x: f'https://www.mlb.com/video/{x["slug"]}',
+            # Fall back to the URL slug so the playlist never ends up id-less
+            ie=MLBVideoIE, playlist_id=content_real_info.get('translationId') or display_id,
+            title=self._html_search_meta('og:title', webpage),
+            description=content_real_info.get('summary'),
+            modified_timestamp=parse_iso8601(content_real_info.get('lastUpdatedDate')))