]>
Commit | Line | Data |
---|---|---|
0de668af JMF |
1 | from __future__ import unicode_literals |
2 | ||
cba5d1b6 | 3 | import itertools |
315ab3d5 | 4 | import hashlib |
27b1c73f | 5 | import json |
59fc531f JMF |
6 | import re |
7 | ||
8 | from .common import InfoExtractor | |
238d42cf S |
9 | from ..compat import ( |
10 | compat_str, | |
11 | compat_HTTPError, | |
12 | ) | |
e1ec9330 | 13 | from ..utils import ( |
238d42cf | 14 | ExtractorError, |
c4096e8a | 15 | get_element_by_attribute, |
e1ec9330 | 16 | int_or_none, |
87696e78 | 17 | lowercase_escape, |
238d42cf | 18 | std_headers, |
98960c91 | 19 | try_get, |
e1ec9330 | 20 | ) |
59fc531f | 21 | |
0de668af | 22 | |
class InstagramIE(InfoExtractor):
    """Extractor for a single Instagram post URL (``instagram.com/p/<id>``).

    ``_real_extract`` returns either one video info dict or, when the post has
    no video of its own but lists children under ``edge_sidecar_to_children``,
    a playlist built from the child videos.
    """

    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'Naomi Leonor Phan-Quang',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
    }, {
        # missing description
        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
        'info_dict': {
            'id': 'BA-pQFBG8HZ',
            'ext': 'mp4',
            'title': 'Video by britneyspears',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1453760977,
            'upload_date': '20160125',
            'uploader_id': 'britneyspears',
            'uploader': 'Britney Spears',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # multi video post
        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
        'playlist': [{
            'info_dict': {
                'id': 'BQ0dSaohpPW',
                'ext': 'mp4',
                'title': 'Video 1',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dTpOhuHT',
                'ext': 'mp4',
                'title': 'Video 2',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dT7RBFeF',
                'ext': 'mp4',
                'title': 'Video 3',
            },
        }],
        'info_dict': {
            'id': 'BQ0eAlwhDrw',
            'title': 'Post by instagram',
            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
        },
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_embed_url(webpage):
        """Return the URL of an Instagram post embedded in ``webpage``, if any.

        An ``<iframe>`` whose ``src`` points at an ``instagram.com/p/.../embed``
        URL is tried first. Otherwise the first ``<a href>`` inside the element
        with class ``instagram-media`` is used. Returns None when neither is
        found.
        """
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
            webpage)
        if mobj:
            return mobj.group('url')

        blockquote_el = get_element_by_attribute(
            'class', 'instagram-media', webpage)
        if blockquote_el is None:
            return

        mobj = re.search(
            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
        if mobj:
            return mobj.group('link')

    def _real_extract(self, url):
        """Extract the video, or a playlist of child videos, of one post.

        Metadata is read from the ``window._sharedData`` JSON embedded in the
        page when it can be parsed. Fields that are still missing afterwards
        (video URL, uploader id, description, thumbnail) are looked up in the
        page markup instead.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Keep only the part matched by the 'url' group, i.e. drop whatever
        # follows the post id (query string, fragment, trailing path).
        url = mobj.group('url')

        webpage = self._download_webpage(url, video_id)

        (video_url, description, thumbnail, timestamp, uploader,
         uploader_id, like_count, comment_count, comments, height,
         width) = [None] * 11

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                webpage, 'shared data', default='{}'),
            video_id, fatal=False)
        if shared_data:
            # Two layouts are supported: the 'graphql' one and the plain
            # 'media' one.
            media = try_get(
                shared_data,
                (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
                 lambda x: x['entry_data']['PostPage'][0]['media']),
                dict)
            if media:
                video_url = media.get('video_url')

                # dict.get(key, {}) only covers a *missing* key; a key that is
                # present with a null (or otherwise non-dict) value would make
                # the chained .get() raise. Type-checked lookups keep such
                # data from aborting the extraction.
                dimensions = try_get(
                    media, lambda x: x['dimensions'], dict) or {}
                height = int_or_none(dimensions.get('height'))
                width = int_or_none(dimensions.get('width'))

                description = try_get(
                    media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
                    compat_str) or media.get('caption')
                thumbnail = media.get('display_src')
                timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))

                owner = try_get(media, lambda x: x['owner'], dict) or {}
                uploader = owner.get('full_name')
                uploader_id = owner.get('username')

                def get_count(key, kind):
                    # 'edge_media_<key>' is tried first, then '<kind>s'
                    return int_or_none(try_get(
                        media, (lambda x: x['edge_media_%s' % key]['count'],
                                lambda x: x['%ss' % kind]['count'])))
                like_count = get_count('preview_like', 'like')
                comment_count = get_count('to_comment', 'comment')

                comments = []
                comment_nodes = try_get(
                    media, lambda x: x['comments']['nodes'], list) or []
                for comment in comment_nodes:
                    # Ignore malformed nodes and comments without text
                    if not isinstance(comment, dict) or not comment.get('text'):
                        continue
                    user = try_get(comment, lambda x: x['user'], dict) or {}
                    comments.append({
                        'author': user.get('username'),
                        'author_id': user.get('id'),
                        'id': comment.get('id'),
                        'text': comment.get('text'),
                        'timestamp': int_or_none(comment.get('created_at')),
                    })

                if not video_url:
                    # No video on the post itself: look for child media
                    edges = try_get(
                        media, lambda x: x['edge_sidecar_to_children']['edges'],
                        list) or []
                    if edges:
                        entries = []
                        # Numbering follows the position among all children,
                        # including the ones that get skipped.
                        for edge_num, edge in enumerate(edges, start=1):
                            node = try_get(edge, lambda x: x['node'], dict)
                            if not node:
                                continue
                            node_video_url = try_get(node, lambda x: x['video_url'], compat_str)
                            if not node_video_url:
                                continue
                            # A child without any identifier cannot form a
                            # valid entry; skip it rather than fail the post.
                            node_id = node.get('shortcode') or node.get('id')
                            if not node_id:
                                continue
                            entries.append({
                                'id': node_id,
                                'title': 'Video %d' % edge_num,
                                'url': node_video_url,
                                'thumbnail': node.get('display_url'),
                                'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
                                'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
                                'view_count': int_or_none(node.get('video_view_count')),
                            })
                        return self.playlist_result(
                            entries, video_id,
                            'Post by %s' % uploader_id if uploader_id else None,
                            description)

        # Fallbacks for anything the shared data did not provide
        if not video_url:
            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': width,
            'height': height,
        }]

        if not uploader_id:
            uploader_id = self._search_regex(
                r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
                webpage, 'uploader id', fatal=False)

        if not description:
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                description = lowercase_escape(description)

        if not thumbnail:
            thumbnail = self._og_search_thumbnail(webpage)

        return {
            'id': video_id,
            'formats': formats,
            'ext': 'mp4',
            'title': 'Video by %s' % uploader_id,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader': uploader,
            'like_count': like_count,
            'comment_count': comment_count,
            'comments': comments,
        }
ea38e55f PH |
227 | |
228 | ||
class InstagramUserIE(InfoExtractor):
    """Extractor that turns an Instagram profile URL into a playlist of the
    profile's video posts."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TEST = {
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }

    # Signature prefix of the last GraphQL request that succeeded; once set,
    # it is the only prefix tried for later requests.
    _gis_tmpl = None

    def _entries(self, data):
        """Yield one ``url_result`` entry per video post of the profile.

        ``data`` is the parsed shared-data object of the profile page. Posts
        are fetched page by page from the GraphQL endpoint until a page has
        no edges or no usable next-page cursor.
        """
        def edge_count(post, suffix):
            # post['edge_media_<suffix>']['count'] as an int, or None
            return int_or_none(try_get(
                post, lambda x: x['edge_media_' + suffix]['count']))

        user_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        csrf_token = data['config']['csrf_token']
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

        self._set_cookie('instagram.com', 'ig_pr', '1')

        end_cursor = ''
        for page_num in itertools.count(1):
            variables = json.dumps({
                'id': user_id,
                'first': 12,
                'after': end_cursor,
            })

            # Without a known-good prefix, try each candidate in turn
            candidates = [self._gis_tmpl] if self._gis_tmpl else [
                '%s' % rhx_gis,
                '',
                '%s:%s' % (rhx_gis, csrf_token),
                '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
            ]

            for candidate in candidates:
                # The header value is the MD5 of '<prefix>:<variables JSON>'
                signature = hashlib.md5(
                    ('%s:%s' % (candidate, variables)).encode('utf-8')).hexdigest()
                try:
                    response = self._download_json(
                        'https://www.instagram.com/graphql/query/', user_id,
                        'Downloading JSON page %d' % page_num,
                        headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            'X-Instagram-GIS': signature,
                        },
                        query={
                            'query_hash': '42323d64886122307be10013ad2dcc44',
                            'variables': variables,
                        })
                    media = response['data']['user']['edge_owner_to_timeline_media']
                    self._gis_tmpl = candidate
                    break
                except ExtractorError as e:
                    forbidden = isinstance(e.cause, compat_HTTPError) and e.cause.code == 403
                    # An HTTP 403 moves on to the next candidate; any other
                    # error, or a 403 on the last candidate, is re-raised.
                    if forbidden and candidate != candidates[-1]:
                        continue
                    raise

            edges = media.get('edges')
            if not (edges and isinstance(edges, list)):
                return

            for edge in edges:
                post = edge.get('node')
                if not (post and isinstance(post, dict)):
                    continue
                # Only video posts are listed
                is_video = post.get('__typename') == 'GraphVideo' or post.get('is_video') is True
                if not is_video:
                    continue
                shortcode = post.get('shortcode')
                if not shortcode:
                    continue

                entry = self.url_result(
                    'https://instagram.com/p/%s/' % shortcode,
                    ie=InstagramIE.ie_key(), video_id=shortcode)
                entry.update({
                    'description': try_get(
                        post, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
                        compat_str),
                    'thumbnail': post.get('thumbnail_src') or post.get('display_src'),
                    'timestamp': int_or_none(post.get('taken_at_timestamp')),
                    'comment_count': edge_count(post, 'to_comment'),
                    'like_count': edge_count(post, 'preview_like'),
                    'view_count': int_or_none(post.get('video_view_count')),
                })

                yield entry

            # Stop unless the response names a usable cursor for a next page
            paging = media.get('page_info')
            if not (paging and isinstance(paging, dict)):
                return
            if not paging.get('has_next_page'):
                return
            end_cursor = paging.get('end_cursor')
            if not (end_cursor and isinstance(end_cursor, compat_str)):
                return

    def _real_extract(self, url):
        """Download the profile page and return a playlist of its videos.

        The user name from the URL is used as both playlist id and title.
        """
        username = self._match_id(url)

        webpage = self._download_webpage(url, username)

        shared_json = self._search_regex(
            r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data')
        data = self._parse_json(shared_json, username)

        return self.playlist_result(
            self._entries(data), username, username)