+ def _real_extract(self, url):
+ page_id = self._match_id(url)
+ webpage = self._download_webpage(url, page_id)
+ art_json = self._search_json(
+ r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
+ transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
+
+ blocks = traverse_obj(art_json, (
+ 'sprinkledBody', 'content', ..., ('ledeMedia', None),
+ lambda _, v: v['__typename'] in ('Video', 'Audio')))
+ if not blocks:
+ raise ExtractorError('Unable to extract any media blocks from webpage')
+
+ common_info = {
+ 'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'),
+ 'description': traverse_obj(art_json, (
+ 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
+ get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
+ 'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
+ 'creator': ', '.join(
+ traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list)
+ 'thumbnails': self._extract_thumbnails(traverse_obj(
+ art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
+ }