]>
Commit | Line | Data |
---|---|---|
1 | from __future__ import unicode_literals | |
2 | ||
3 | import itertools | |
4 | import hashlib | |
5 | import json | |
6 | import re | |
7 | ||
8 | from .common import InfoExtractor | |
9 | from ..compat import ( | |
10 | compat_str, | |
11 | compat_HTTPError, | |
12 | ) | |
13 | from ..utils import ( | |
14 | ExtractorError, | |
15 | float_or_none, | |
16 | get_element_by_attribute, | |
17 | int_or_none, | |
18 | lowercase_escape, | |
19 | std_headers, | |
20 | try_get, | |
21 | url_or_none, | |
22 | ) | |
23 | ||
24 | ||
class InstagramIE(InfoExtractor):
    # Extractor for single Instagram post pages (/p/, /tv/ and /reel/ URLs).
    # A post carrying several videos is returned as a playlist.
    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y F O R A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
    }, {
        # missing description
        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
        'info_dict': {
            'id': 'BA-pQFBG8HZ',
            'ext': 'mp4',
            'title': 'Video by britneyspears',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1453760977,
            'upload_date': '20160125',
            'uploader_id': 'britneyspears',
            'uploader': 'Britney Spears',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # multi video post
        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
        'playlist': [{
            'info_dict': {
                'id': 'BQ0dSaohpPW',
                'ext': 'mp4',
                'title': 'Video 1',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dTpOhuHT',
                'ext': 'mp4',
                'title': 'Video 2',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dT7RBFeF',
                'ext': 'mp4',
                'title': 'Video 3',
            },
        }],
        'info_dict': {
            'id': 'BQ0eAlwhDrw',
            'title': 'Post by instagram',
            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
        },
    }, {
        # IGTV
        'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
        'info_dict': {
            'id': 'BkfuX9UB-eK',
            'ext': 'mp4',
            'title': 'Fingerboarding Tricks with @cass.fb',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 53.83,
            'timestamp': 1530032919,
            'upload_date': '20180626',
            'uploader_id': 'instagram',
            'uploader': 'Instagram',
            'like_count': int,
            'comment_count': int,
            'comments': list,
            'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
        }
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/tv/aye83DjauH/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_embed_url(webpage):
        # Returns the URL of an Instagram embed found in webpage, or None.
        # An embed <iframe> is preferred; otherwise the first link inside
        # the element carrying the "instagram-media" class is used.
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
            webpage)
        if mobj:
            return mobj.group('url')

        blockquote_el = get_element_by_attribute(
            'class', 'instagram-media', webpage)
        if blockquote_el is None:
            return

        mobj = re.search(
            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
        if mobj:
            return mobj.group('link')

    def _real_extract(self, url):
        # Downloads the post page and builds either a single video info dict
        # or, for a post with several videos, a playlist result.
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        url = mobj.group('url')

        webpage = self._download_webpage(url, video_id)

        # Every field read when building the result must start out as None,
        # title and duration included: when no media JSON is found the code
        # below falls back to scraping the page and still reads all of them.
        (media, video_url, description, title, thumbnail, duration,
         timestamp, uploader, uploader_id, like_count, comment_count,
         comments, height, width) = [None] * 14

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                webpage, 'shared data', default='{}'),
            video_id, fatal=False)
        if shared_data:
            media = try_get(
                shared_data,
                (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
                 lambda x: x['entry_data']['PostPage'][0]['media']),
                dict)
        # _sharedData.entry_data.PostPage is empty when authenticated (see
        # https://github.com/ytdl-org/youtube-dl/pull/22880)
        if not media:
            additional_data = self._parse_json(
                self._search_regex(
                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
                    webpage, 'additional data', default='{}'),
                video_id, fatal=False)
            if additional_data:
                media = try_get(
                    additional_data, lambda x: x['graphql']['shortcode_media'],
                    dict)
        if media:
            video_url = media.get('video_url')
            # try_get instead of .get(key, {}).get(...): the latter crashes
            # when the key is present but holds null.
            height = int_or_none(try_get(media, lambda x: x['dimensions']['height']))
            width = int_or_none(try_get(media, lambda x: x['dimensions']['width']))
            description = try_get(
                media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
                compat_str) or media.get('caption')
            title = media.get('title')
            thumbnail = media.get('display_src') or media.get('display_url')
            duration = float_or_none(media.get('video_duration'))
            timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
            uploader = try_get(media, lambda x: x['owner']['full_name'])
            uploader_id = try_get(media, lambda x: x['owner']['username'])

            def get_count(keys, kind):
                # Returns the first count found under any of the
                # 'edge_media_<key>' entries, or under '<kind>s'; None if
                # there is none.
                if not isinstance(keys, (list, tuple)):
                    keys = [keys]
                for key in keys:
                    count = int_or_none(try_get(
                        media, (lambda x: x['edge_media_%s' % key]['count'],
                                lambda x: x['%ss' % kind]['count'])))
                    if count is not None:
                        return count
            like_count = get_count('preview_like', 'like')
            comment_count = get_count(
                ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')

            comments = [{
                'author': try_get(comment, lambda x: x['user']['username']),
                'author_id': try_get(comment, lambda x: x['user']['id']),
                'id': comment.get('id'),
                'text': comment.get('text'),
                'timestamp': int_or_none(comment.get('created_at')),
            } for comment in try_get(
                media, lambda x: x['comments']['nodes'], list) or []
                if isinstance(comment, dict) and comment.get('text')]
            if not video_url:
                # No top-level video: the post may carry several media
                # items, each of which can be a video of its own.
                edges = try_get(
                    media, lambda x: x['edge_sidecar_to_children']['edges'],
                    list) or []
                if edges:
                    entries = []
                    for edge_num, edge in enumerate(edges, start=1):
                        node = try_get(edge, lambda x: x['node'], dict)
                        if not node:
                            continue
                        node_video_url = url_or_none(node.get('video_url'))
                        if not node_video_url:
                            continue
                        entries.append({
                            'id': node.get('shortcode') or node['id'],
                            'title': node.get('title') or 'Video %d' % edge_num,
                            'url': node_video_url,
                            'thumbnail': node.get('display_url'),
                            'duration': float_or_none(node.get('video_duration')),
                            'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
                            'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
                            'view_count': int_or_none(node.get('video_view_count')),
                        })
                    return self.playlist_result(
                        entries, video_id,
                        'Post by %s' % uploader_id if uploader_id else None,
                        description)

        # Fallbacks scraped from the page itself when the JSON did not
        # provide the value.
        if not video_url:
            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': width,
            'height': height,
        }]

        if not uploader_id:
            uploader_id = self._search_regex(
                r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
                webpage, 'uploader id', fatal=False)

        if not description:
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                description = lowercase_escape(description)

        if not thumbnail:
            thumbnail = self._og_search_thumbnail(webpage)

        return {
            'id': video_id,
            'formats': formats,
            'ext': 'mp4',
            'title': title or 'Video by %s' % uploader_id,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader': uploader,
            'like_count': like_count,
            'comment_count': comment_count,
            'comments': comments,
        }
277 | ||
278 | ||
class InstagramPlaylistIE(InfoExtractor):
    # Shared base for extractors that turn a paginated GraphQL query into a
    # playlist. This class relies on _QUERY_HASH, _parse_timeline_from() and
    # _query_vars_for(), none of which it defines itself.

    _gis_tmpl = None  # GIS signature template that worked last, if any

    def _parse_graphql(self, webpage, item_id):
        # Locates the "sharedData" JSON object in the page and decodes it.
        shared_data_json = self._search_regex(
            r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data')
        return self._parse_json(shared_data_json, item_id)

    def _extract_graphql(self, data, url):
        # Generator walking the GraphQL timeline page by page; yields one
        # url_result (delegating to InstagramIE) per video entry found.
        def edge_count(source, suffix):
            return int_or_none(try_get(
                source, lambda x: x['edge_media_' + suffix]['count']))

        uploader_id = self._match_id(url)
        csrf_token = data['config']['csrf_token']
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

        cursor = ''
        page_num = 0
        while True:
            page_num += 1

            query_vars = {
                'first': 12,
                'after': cursor,
            }
            query_vars.update(self._query_vars_for(data))
            variables = json.dumps(query_vars)

            # Reuse the template that already worked; otherwise probe every
            # known way of building the GIS signature.
            gis_tmpls = [self._gis_tmpl] if self._gis_tmpl else [
                '%s' % rhx_gis,
                '',
                '%s:%s' % (rhx_gis, csrf_token),
                '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
            ]

            for gis_tmpl in gis_tmpls:
                signature_input = '%s:%s' % (gis_tmpl, variables)
                request_headers = {
                    'X-Requested-With': 'XMLHttpRequest',
                    'X-Instagram-GIS': hashlib.md5(
                        signature_input.encode('utf-8')).hexdigest(),
                }
                request_query = {
                    'query_hash': self._QUERY_HASH,
                    'variables': variables,
                }
                try:
                    json_data = self._download_json(
                        'https://www.instagram.com/graphql/query/', uploader_id,
                        'Downloading JSON page %d' % page_num,
                        headers=request_headers, query=request_query)
                    media = self._parse_timeline_from(json_data)
                    # remember the working template for later requests
                    self._gis_tmpl = gis_tmpl
                    break
                except ExtractorError as err:
                    # A 403 means the signature was rejected; move on to the
                    # next template unless this was the last candidate.
                    rejected = (
                        isinstance(err.cause, compat_HTTPError)
                        and err.cause.code == 403)
                    if rejected and gis_tmpl != gis_tmpls[-1]:
                        continue
                    raise

            edges = media.get('edges')
            if not (edges and isinstance(edges, list)):
                return

            for edge in edges:
                node = edge.get('node')
                if not (node and isinstance(node, dict)):
                    continue
                is_video = (
                    node.get('__typename') == 'GraphVideo'
                    or node.get('is_video') is True)
                if not is_video:
                    continue
                video_id = node.get('shortcode')
                if not video_id:
                    continue

                info = self.url_result(
                    'https://instagram.com/p/%s/' % video_id,
                    ie=InstagramIE.ie_key(), video_id=video_id)
                info.update({
                    'description': try_get(
                        node,
                        lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
                        compat_str),
                    'thumbnail': node.get('thumbnail_src') or node.get('display_src'),
                    'timestamp': int_or_none(node.get('taken_at_timestamp')),
                    'comment_count': edge_count(node, 'to_comment'),
                    'like_count': edge_count(node, 'preview_like'),
                    'view_count': int_or_none(node.get('video_view_count')),
                })
                yield info

            # Stop unless the response names a usable cursor for a next page.
            page_info = media.get('page_info')
            if not (page_info and isinstance(page_info, dict)):
                return
            if not page_info.get('has_next_page'):
                return
            cursor = page_info.get('end_cursor')
            if not (cursor and isinstance(cursor, compat_str)):
                return

    def _real_extract(self, url):
        # Fetches the listing page, decodes its embedded data and wraps the
        # lazily generated entries in a playlist named after the URL id.
        user_or_tag = self._match_id(url)
        webpage = self._download_webpage(url, user_or_tag)
        data = self._parse_graphql(webpage, user_or_tag)

        self._set_cookie('instagram.com', 'ig_pr', '1')

        entries = self._extract_graphql(data, url)
        return self.playlist_result(entries, user_or_tag, user_or_tag)
406 | ||
407 | ||
class InstagramUserIE(InstagramPlaylistIE):
    # Playlist extractor for an Instagram user profile URL; supplies the
    # query hash, query variables and result path for the user timeline.
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TEST = {
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }

    # Must be a plain string: a trailing comma after the literal would turn
    # this into a 1-tuple.
    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['user']['edge_owner_to_timeline_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        }
440 | ||
441 | ||
class InstagramTagIE(InstagramPlaylistIE):
    # Playlist extractor for an Instagram hashtag page URL; supplies the
    # query hash, query variables and result path for the hashtag media.
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
    IE_DESC = 'Instagram hashtag search'
    IE_NAME = 'instagram:tag'
    _TEST = {
        'url': 'https://instagram.com/explore/tags/lolcats',
        'info_dict': {
            'id': 'lolcats',
            'title': 'lolcats',
        },
        'playlist_count': 50,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 50,
        }
    }

    # Must be a plain string: a trailing comma after the literal would turn
    # this into a 1-tuple.
    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['hashtag']['edge_hashtag_to_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'tag_name':
                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
        }