]>
Commit | Line | Data |
---|---|---|
07256b9f S |
1 | import json |
2 | import uuid | |
4191779d | 3 | |
d664de44 | 4 | from .common import InfoExtractor |
f3c0c667 | 5 | from ..utils import ( |
07256b9f S |
6 | ExtractorError, |
7 | clean_html, | |
47da7823 | 8 | determine_ext, |
07256b9f | 9 | extract_attributes, |
f3c0c667 | 10 | float_or_none, |
07256b9f | 11 | get_elements_html_by_class, |
f3c0c667 | 12 | int_or_none, |
07256b9f | 13 | merge_dicts, |
4191779d | 14 | mimetype2ext, |
47da7823 | 15 | parse_iso8601, |
07256b9f | 16 | remove_end, |
47da7823 | 17 | remove_start, |
07256b9f S |
18 | str_or_none, |
19 | traverse_obj, | |
20 | url_or_none, | |
f3c0c667 | 21 | ) |
d664de44 S |
22 | |
23 | ||
class NYTimesBaseIE(InfoExtractor):
    """Shared helpers for the New York Times extractors.

    Offers a GraphQL lookup for a single video plus parsers that turn the
    returned renditions and image crops into formats, subtitles and thumbnails.
    """
    _DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d')
    _TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB'
    _GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2'
    # Whitespace inside the query is insignificant to GraphQL
    _GRAPHQL_QUERY = '''query VideoQuery($id: String!) {
  video(id: $id) {
    ... on Video {
      bylines {
        renderedRepresentation
      }
      duration
      firstPublished
      promotionalHeadline
      promotionalMedia {
        ... on Image {
          crops {
            name
            renditions {
              name
              width
              height
              url
            }
          }
        }
      }
      renditions {
        type
        width
        height
        url
        bitrate
      }
      summary
    }
  }
}'''

    def _call_api(self, media_id):
        """Query the GraphQL API for the video identified by *media_id*.

        Returns the dict found under ``data -> video`` in the response, or an
        empty dict when the (non-fatal) request fails or no dict is found there.
        """
        # reference: `id-to-uri.js`
        # The API id is a UUIDv5 of the media id inside a 'video' namespace,
        # itself derived from _DNS_NAMESPACE
        video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video')
        media_uuid = uuid.uuid5(video_uuid, media_id)

        return traverse_obj(self._download_json(
            self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API', data=json.dumps({
                'query': self._GRAPHQL_QUERY,
                'variables': {'id': f'nyt://video/{media_uuid}'},
            }, separators=(',', ':')).encode(), headers={
                'Content-Type': 'application/json',
                'Nyt-App-Type': 'vhs',
                'Nyt-App-Version': 'v3.52.21',
                'Nyt-Token': self._TOKEN,
                'Origin': 'https://nytimes.com',
            }, fatal=False), ('data', 'video', {dict})) or {}

    def _extract_thumbnails(self, thumbs):
        """Map image renditions to thumbnail dicts, keeping only those with a valid URL."""
        return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), {
            'url': 'url',
            'width': ('width', {int_or_none}),
            'height': ('height', {int_or_none}),
        }), default=None)

    def _extract_formats_and_subtitles(self, video_id, content_media_json):
        """Build formats and subtitles from the ``renditions`` of *content_media_json*.

        Renditions without a URL, of type ``thumbs``, or whose URL was already
        processed are skipped. Returns a ``(formats, subtitles)`` tuple.
        """
        urls = []
        formats = []
        subtitles = {}
        for video in traverse_obj(content_media_json, ('renditions', ..., {dict})):
            video_url = video.get('url')
            format_id = video.get('type')
            if not video_url or format_id == 'thumbs' or video_url in urls:
                continue
            urls.append(video_url)
            # Prefer the declared mimetype, fall back to the URL's extension
            ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
            if ext == 'm3u8':
                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                    video_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id or 'hls', fatal=False)
                formats.extend(m3u8_fmts)
                self._merge_subtitles(m3u8_subs, target=subtitles)
            elif ext == 'mpd':
                continue  # all mpd urls give 404 errors
            else:
                formats.append({
                    'url': video_url,
                    'format_id': format_id,
                    'vcodec': video.get('videoencoding') or video.get('video_codec'),
                    'width': int_or_none(video.get('width')),
                    'height': int_or_none(video.get('height')),
                    # Size may sit directly under either key or nested under its 'value'
                    'filesize': traverse_obj(video, (
                        ('file_size', 'fileSize'), (None, 'value'), {int_or_none}), get_all=False),
                    # Bitrate is divided by 1000; a zero result is reported as unknown
                    'tbr': int_or_none(video.get('bitrate'), 1000) or None,
                    'ext': ext,
                })

        return formats, subtitles

    def _extract_video(self, media_id):
        """Return the info dict for *media_id* built from the GraphQL API response."""
        data = self._call_api(media_id)
        formats, subtitles = self._extract_formats_and_subtitles(media_id, data)

        # Only string bylines are joined; the leading 'By ' is stripped from each
        bylines = traverse_obj(data, (
            'bylines', ..., 'renderedRepresentation', {str}, {lambda x: remove_start(x, 'By ')}))

        return {
            'id': media_id,
            'title': data.get('promotionalHeadline'),
            'description': data.get('summary'),
            'timestamp': parse_iso8601(data.get('firstPublished')),
            'duration': float_or_none(data.get('duration'), scale=1000),
            # Report a missing byline as None rather than an empty string
            'creator': ', '.join(bylines) or None,  # TODO: change to 'creators'
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': self._extract_thumbnails(
                traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
        }
50aa43b3 YCH |
137 | |
138 | ||
class NYTimesIE(NYTimesBaseIE):
    """Extractor for nytimes.com ``/video/`` pages and graphics8 ``embed.html`` iframes."""
    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
    _TESTS = [{
        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
        'md5': 'a553aa344014e3723d33893d89d4defc',
        'info_dict': {
            'id': '100000002847155',
            'ext': 'mp4',
            'title': 'Verbatim: What Is a Photocopier?',
            'description': 'md5:93603dada88ddbda9395632fdc5da260',
            'timestamp': 1398646132,
            'upload_date': '20140428',
            'creator': 'Brett Weiner',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.+\.jpg',
            'duration': 419,
        },
    }, {
        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video whose numeric id is captured from *url*."""
        return self._extract_video(self._match_id(url))
50aa43b3 YCH |
165 | |
166 | ||
class NYTimesArticleIE(NYTimesBaseIE):
    """Extractor for dated nytimes.com article pages that carry video or audio blocks."""
    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?!books|podcasts)[^/?#]+/(?:\w+/)?(?P<id>[^./?#]+)(?:\.html)?'
    _TESTS = [{
        'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
        'md5': '3eb5ddb1d6f86254fe4f233826778737',
        'info_dict': {
            'id': '100000003628438',
            'ext': 'mp4',
            'title': 'One Company’s New Minimum Wage: $70,000 a Year',
            'description': 'md5:89ba9ab67ca767bb92bf823d1f138433',
            'timestamp': 1429047468,
            'upload_date': '20150414',
            'uploader': 'Matthew Williams',
            'creator': 'Patricia Cohen',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
            'duration': 119.0,
        },
    }, {
        # article with audio and no video
        'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
        'md5': '2365b3555c8aa7f4dd34ca735ad02e6a',
        'info_dict': {
            'id': '100000009110381',
            'ext': 'mp3',
            'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
            'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
            'timestamp': 1695960700,
            'upload_date': '20230929',
            'creator': 'Stephanie Nolen, Natalija Gormalova',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
            'duration': 1322,
        },
    }, {
        'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
        'md5': '3eb5ddb1d6f86254fe4f233826778737',
        'info_dict': {
            'id': '100000009202270',
            'ext': 'mp4',
            'title': 'Kamala Harris Defends Biden Policies, but Says ‘More Work’ Needed to Reach Voters',
            'description': 'md5:de4212a7e19bb89e4fb14210ca915f1f',
            'timestamp': 1701290997,
            'upload_date': '20231129',
            'uploader': 'By The New York Times',
            'creator': 'Katie Rogers',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
            'duration': 97.631,
        },
        'params': {
            'skip_download': 'm3u8',
        },
    }, {
        # multiple videos in the same article
        'url': 'https://www.nytimes.com/2023/12/02/business/air-traffic-controllers-safety.html',
        'info_dict': {
            'id': 'air-traffic-controllers-safety',
            'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
            'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
            'upload_date': '20231202',
            'creator': 'Emily Steel, Sydney Ember',
            'timestamp': 1701511264,
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
        'only_matching': True,
    }]

    def _extract_content_from_block(self, block):
        """Turn one Video/Audio *block* into an info dict.

        The result holds the block's own metadata followed by its thumbnails,
        formats and subtitles.
        """
        metadata = traverse_obj(block, {
            'id': ('sourceId', {str}),
            'uploader': ('bylines', ..., 'renderedRepresentation', {str}),
            'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))),
            'timestamp': ('firstPublished', {parse_iso8601}),
            'series': ('podcastSeries', {str}),
        }, get_all=False)

        media_id = metadata.get('id')
        formats, subtitles = self._extract_formats_and_subtitles(media_id, block)

        # audio articles will have an url and no formats
        file_url = traverse_obj(block, ('fileUrl', {url_or_none}))
        if file_url and not formats:
            formats.append({'url': file_url, 'vcodec': 'none'})

        crop_renditions = traverse_obj(
            block, ('promotionalMedia', 'crops', ..., 'renditions', ...))

        info = dict(metadata)
        info.update({
            'thumbnails': self._extract_thumbnails(crop_renditions),
            'formats': formats,
            'subtitles': subtitles,
        })
        return info

    def _real_extract(self, url):
        """Extract every media block of an article.

        Returns a single info dict when exactly one block is found and a
        playlist otherwise; raises ExtractorError when no block is found.
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)

        # The embedded JS object uses `undefined`, which JSON cannot represent
        preloaded = self._search_json(
            r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
            transform_source=lambda x: x.replace('undefined', 'null'))
        art_json = preloaded['initialData']['data']['article']

        blocks = traverse_obj(art_json, (
            'sprinkledBody', 'content', ..., ('ledeMedia', None),
            lambda _, v: v['__typename'] in ('Video', 'Audio')))
        if not blocks:
            raise ExtractorError('Unable to extract any media blocks from webpage')

        # Article-level metadata shared by every entry
        page_title = remove_end(self._html_extract_title(webpage), ' - The New York Times')
        page_description = traverse_obj(art_json, (
            'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
            get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage)
        author_names = traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))
        asset_renditions = traverse_obj(
            art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))
        common_info = {
            'title': page_title,
            'description': page_description,
            'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
            'creator': ', '.join(author_names),  # TODO: change to 'creators' (list)
            'thumbnails': self._extract_thumbnails(asset_renditions),
        }

        entries = [
            merge_dicts(self._extract_content_from_block(block), common_info)
            for block in blocks
        ]

        if len(entries) == 1:
            # The page id is only a fallback; an id found in the block replaces it
            single = {'id': page_id}
            single.update(entries[0])
            return single

        return self.playlist_result(entries, page_id, **common_info)
293 | ||
07256b9f S |
294 | |
class NYTimesCookingIE(NYTimesBaseIE):
    """Extractor for cooking.nytimes.com guide pages (a lead video plus optional extra videos)."""
    IE_NAME = 'NYTimesCookingGuide'
    _VALID_URL = r'https?://cooking\.nytimes\.com/guides/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
        'info_dict': {
            'id': '13-how-to-cook-a-turkey',
            'title': 'How to Cook a Turkey',
            'description': 'md5:726cfd3f9b161bdf5c279879e8050ca0',
        },
        'playlist_count': 2,
    }, {
        # single video example
        'url': 'https://cooking.nytimes.com/guides/50-how-to-make-mac-and-cheese',
        'md5': '64415805fe0b8640fce6b0b9def5989a',
        'info_dict': {
            'id': '100000005835845',
            'ext': 'mp4',
            'title': 'How to Make Mac and Cheese',
            'description': 'md5:b8f2f33ec1fb7523b21367147c9594f1',
            'timestamp': 1522950315,
            'upload_date': '20180405',
            'duration': 9.51,
            'creator': 'Alison Roman',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }, {
        'url': 'https://cooking.nytimes.com/guides/20-how-to-frost-a-cake',
        'md5': '64415805fe0b8640fce6b0b9def5989a',
        'info_dict': {
            'id': '20-how-to-frost-a-cake',
            'title': 'How to Frost a Cake',
            'description': 'md5:a31fe3b98a8ce7b98aae097730c269cd',
        },
        'playlist_count': 8,
    }]

    def _real_extract(self, url):
        """Extract a guide page.

        When the page lists ``video-item`` elements, a playlist of those videos
        followed by the lead video is returned. Otherwise only the lead video
        is returned, with page metadata overlaid where it was found.
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)
        title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
        description = self._html_search_meta(['og:description', 'twitter:description'], webpage)

        lead_video_id = self._search_regex(
            r'data-video-player-id="(\d+)"></div>', webpage, 'lead video')
        media_ids = traverse_obj(
            get_elements_html_by_class('video-item', webpage), (..., {extract_attributes}, 'data-video-id'))

        if media_ids:
            media_ids.append(lead_video_id)
            return self.playlist_result(
                [self._extract_video(media_id) for media_id in media_ids], page_id, title, description)

        page_info = {
            'title': title,
            'description': description,
            'creator': self._search_regex(  # TODO: change to 'creators'
                r'<span itemprop="author">([^<]+)</span></p>', webpage, 'author', default=None),
        }
        return {
            **self._extract_video(lead_video_id),
            # Page metadata wins over the API values, but a field missing from
            # the page must not wipe out the value the API provided
            **{key: value for key, value in page_info.items() if value is not None},
        }
70c5802b | 355 | |
07256b9f S |
356 | |
class NYTimesCookingRecipeIE(InfoExtractor):
    """Extractor for cooking.nytimes.com recipe pages that carry an HLS video."""
    _VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
        'md5': '579e83bbe8e61e9de67f80edba8a78a8',
        'info_dict': {
            'id': '1017817',
            'ext': 'mp4',
            'title': 'Cranberry Curd Tart',
            'description': 'md5:ad77a3fc321db636256d4343c5742152',
            'timestamp': 1447804800,
            'upload_date': '20151118',
            'creator': 'David Tanis',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }, {
        'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies',
        'md5': '58df35998241dcf0620e99e646331b42',
        'info_dict': {
            'id': '1024781',
            'ext': 'mp4',
            'title': 'Neapolitan Checkerboard Cookies',
            'description': 'md5:ba12394c585ababea951cb6d2fcc6631',
            'timestamp': 1701302400,
            'upload_date': '20231130',
            'creator': 'Sue Li',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }, {
        'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats',
        'md5': '2fe7965a3adc899913b8e25ada360823',
        'info_dict': {
            'id': '1019516',
            'ext': 'mp4',
            'timestamp': 1546387200,
            'description': 'md5:8856ce10239161bd2596ac335b9f9bfb',
            'upload_date': '20190102',
            'title': 'Overnight Oats',
            'creator': 'Genevieve Ko',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }]

    def _real_extract(self, url):
        """Extract the video of a recipe page.

        Recipe details are read from ``props -> pageProps -> recipe`` of the
        data returned by ``_search_nextjs_data``. Raises an expected
        ExtractorError when the recipe has no ``videoSrc``.
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)
        recipe_data = self._search_nextjs_data(webpage, page_id)['props']['pageProps']['recipe']

        # Fail with a clear message instead of a bare KeyError when there is no video
        video_src = recipe_data.get('videoSrc')
        if not video_src:
            raise ExtractorError('This recipe does not include a video', expected=True)

        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            video_src, page_id, 'mp4', m3u8_id='hls')

        return {
            # Fall back to the id from the URL when the recipe data carries none
            'id': page_id,
            **traverse_obj(recipe_data, {
                'id': ('id', {str_or_none}),
                'title': ('title', {str}),
                'description': ('topnote', {clean_html}),
                'timestamp': ('publishedAt', {int_or_none}),
                'creator': ('contentAttribution', 'cardByline', {str}),
            }),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [{'url': thumb_url} for thumb_url in traverse_obj(
                recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))],
        }