]>
Commit | Line | Data |
---|---|---|
315ab3d5 | 1 | import hashlib |
e3e606de | 2 | import itertools |
27b1c73f | 3 | import json |
59fc531f | 4 | import re |
ab2ffab2 | 5 | import time |
e3e606de | 6 | import urllib.error |
59fc531f JMF |
7 | |
8 | from .common import InfoExtractor | |
e1ec9330 | 9 | from ..utils import ( |
238d42cf | 10 | ExtractorError, |
e3e606de PD |
11 | decode_base_n, |
12 | encode_base_n, | |
cce889b9 | 13 | float_or_none, |
e3e606de | 14 | format_field, |
c4096e8a | 15 | get_element_by_attribute, |
e1ec9330 | 16 | int_or_none, |
87696e78 | 17 | lowercase_escape, |
013322a9 | 18 | str_or_none, |
4e260d1a | 19 | str_to_int, |
eb56d132 | 20 | traverse_obj, |
3052a30d | 21 | url_or_none, |
ab2ffab2 | 22 | urlencode_postdata, |
e1ec9330 | 23 | ) |
59fc531f | 24 | |
e3e606de PD |
# Alphabet used to convert between numeric media ids and URL shortcodes.
_ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'


def _pk_to_id(id):
    """Convert a media pk (e.g. '123' or '123_456') to its shortcode.

    Only the part before the first underscore is encoded. The value is
    coerced to str first, so an integer pk is accepted as well.

    Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id
    """
    return encode_base_n(int(str(id).split('_')[0]), table=_ENCODING_CHARS)


def _id_to_pk(shortcode):
    """Convert a shortcode to a numeric value (only its first 11 characters are decoded)"""
    return decode_base_n(shortcode[:11], table=_ENCODING_CHARS)
36 | ||
0de668af | 37 | |
class InstagramBaseIE(InfoExtractor):
    """Shared login, metadata and comment helpers for the Instagram extractors."""

    _NETRC_MACHINE = 'instagram'
    # Set on InstagramBaseIE itself after a successful login (see _perform_login)
    _IS_LOGGED_IN = False

    _API_BASE_URL = 'https://i.instagram.com/api/v1'
    _LOGIN_URL = 'https://www.instagram.com/accounts/login'
    _API_HEADERS = {
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': '0',
        'Origin': 'https://www.instagram.com',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    }

    def _perform_login(self, username, password):
        """Log in through the web AJAX endpoint; no-op if already logged in.

        Raises ExtractorError when the login tokens cannot be found in the
        login page or when the server does not report `authenticated`.
        """
        if self._IS_LOGGED_IN:
            return

        login_webpage = self._download_webpage(
            self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage')

        shared_data = self._parse_json(self._search_regex(
            r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None)

        # The regex above falls back to '{}', so the tokens may be absent;
        # fail with a readable error instead of a bare KeyError.
        csrf_token = traverse_obj(shared_data, ('config', 'csrf_token'))
        rollout_hash = traverse_obj(shared_data, 'rollout_hash')
        if csrf_token is None or rollout_hash is None:
            raise ExtractorError('Unable to login: login tokens not found in the login webpage')

        login = self._download_json(
            f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={
                **self._API_HEADERS,
                'X-Requested-With': 'XMLHttpRequest',
                'X-CSRFToken': csrf_token,
                'X-Instagram-AJAX': rollout_hash,
                'Referer': 'https://www.instagram.com/',
            }, data=urlencode_postdata({
                'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
                'username': username,
                'queryParams': '{}',
                'optIntoOneTap': 'false',
                'stopDeletionNonce': '',
                'trustedDeviceRecords': '{}',
            }))

        if not login.get('authenticated'):
            if login.get('message'):
                raise ExtractorError(f'Unable to login: {login["message"]}')
            elif login.get('user'):
                raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True)
            elif login.get('user') is False:
                raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True)
            raise ExtractorError('Unable to login')
        InstagramBaseIE._IS_LOGGED_IN = True

    def _get_count(self, media, kind, *keys):
        """Return the first available count from media[kind] or media['edge_media_<key>']."""
        return traverse_obj(
            media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys),
            expected_type=int_or_none)

    def _get_dimension(self, name, media, webpage=None):
        """Return media['dimensions'][name], else the og:video:<name>/video:<name> meta value."""
        return (
            traverse_obj(media, ('dimensions', name), expected_type=int_or_none)
            or int_or_none(self._html_search_meta(
                (f'og:video:{name}', f'video:{name}'), webpage or '', default=None)))

    def _extract_nodes(self, nodes, is_direct=False):
        """Yield one info dict per video node; non-video nodes are skipped.

        With is_direct the entry carries the node's own `video_url`;
        otherwise it is a `url`-type result pointing at the post page, and
        nodes without a shortcode are skipped.
        """
        for idx, node in enumerate(nodes, start=1):
            if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
                continue

            video_id = node.get('shortcode')

            if is_direct:
                info = {
                    'id': video_id or node['id'],
                    'url': node.get('video_url'),
                    'width': self._get_dimension('width', node),
                    'height': self._get_dimension('height', node),
                    'http_headers': {
                        'Referer': 'https://www.instagram.com/',
                    }
                }
            elif not video_id:
                continue
            else:
                info = {
                    '_type': 'url',
                    'ie_key': 'Instagram',
                    'id': video_id,
                    'url': f'https://instagram.com/p/{video_id}',
                }

            yield {
                **info,
                'title': node.get('title') or (f'Video {idx}' if is_direct else None),
                'description': traverse_obj(
                    node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str),
                'thumbnail': traverse_obj(
                    node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none),
                'duration': float_or_none(node.get('video_duration')),
                'timestamp': int_or_none(node.get('taken_at_timestamp')),
                'view_count': int_or_none(node.get('video_view_count')),
                'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
                'like_count': self._get_count(node, 'likes', 'preview_like'),
            }

    def _extract_product_media(self, product_media):
        """Build id/duration/formats/thumbnails for one media item.

        Returns an empty dict when the item has neither `video_versions`
        nor a DASH manifest.
        """
        # str_or_none: `pk` may be an integer, which _pk_to_id cannot split
        media_id = product_media.get('code') or _pk_to_id(str_or_none(product_media.get('pk')))
        vcodec = product_media.get('video_codec')
        dash_manifest_raw = product_media.get('video_dash_manifest')
        videos_list = product_media.get('video_versions')
        if not (dash_manifest_raw or videos_list):
            return {}

        formats = [{
            'format_id': fmt.get('id'),
            'url': fmt.get('url'),
            'width': fmt.get('width'),
            'height': fmt.get('height'),
            'vcodec': vcodec,
        } for fmt in videos_list or []]
        if dash_manifest_raw:
            formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash'))

        thumbnails = [{
            'url': thumbnail.get('url'),
            'width': thumbnail.get('width'),
            'height': thumbnail.get('height')
        } for thumbnail in traverse_obj(product_media, ('image_versions2', 'candidates')) or []]
        return {
            'id': media_id,
            'duration': float_or_none(product_media.get('video_duration')),
            'formats': formats,
            'thumbnails': thumbnails
        }

    def _extract_product(self, product_info):
        """Build the info dict (or a playlist for carousels) from an API item.

        If product_info is a list, its first element is used.
        Raises ExtractorError when neither `pk` nor `id` is present.
        """
        if isinstance(product_info, list):
            product_info = product_info[0]

        media_pk = traverse_obj(product_info, 'pk', 'id', expected_type=str_or_none)
        if not media_pk:
            raise ExtractorError('Unable to extract media id')
        # Computed once and reused for comments, so an integer or missing
        # `pk` (with `id` present) no longer crashes the comment setup.
        video_id = _pk_to_id(media_pk[:19])

        user_info = product_info.get('user') or {}
        info_dict = {
            'id': video_id,
            'title': product_info.get('title') or f'Video by {user_info.get("username")}',
            'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none),
            'timestamp': int_or_none(product_info.get('taken_at')),
            'channel': user_info.get('username'),
            'uploader': user_info.get('full_name'),
            'uploader_id': str_or_none(user_info.get('pk')),
            'view_count': int_or_none(product_info.get('view_count')),
            'like_count': int_or_none(product_info.get('like_count')),
            'comment_count': int_or_none(product_info.get('comment_count')),
            '__post_extractor': self.extract_comments(video_id),
            'http_headers': {
                'Referer': 'https://www.instagram.com/',
            }
        }
        carousel_media = product_info.get('carousel_media')
        if carousel_media:
            return {
                '_type': 'playlist',
                **info_dict,
                'title': f'Post by {user_info.get("username")}',
                'entries': [{
                    **info_dict,
                    **self._extract_product_media(product_media),
                } for product_media in carousel_media],
            }

        return {
            **info_dict,
            **self._extract_product_media(product_info)
        }

    def _get_comments(self, video_id):
        """Yield comment dicts for the shortcode `video_id`.

        The download is non-fatal; nothing is yielded when it fails.
        """
        comments_info = self._download_json(
            f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/comments/?can_support_threading=true&permalink_enabled=false', video_id,
            fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._API_HEADERS) or {}

        # Two response layouts are handled: GraphQL-style edges and a flat `comments` list
        comment_data = traverse_obj(comments_info, ('edge_media_to_parent_comment', 'edges'), 'comments')
        for comment_dict in comment_data or []:
            yield {
                'author': traverse_obj(comment_dict, ('node', 'owner', 'username'), ('user', 'username')),
                'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id'), ('user', 'pk')),
                'author_thumbnail': traverse_obj(comment_dict, ('node', 'owner', 'profile_pic_url'), ('user', 'profile_pic_url'), expected_type=url_or_none),
                'id': traverse_obj(comment_dict, ('node', 'id'), 'pk'),
                'text': traverse_obj(comment_dict, ('node', 'text'), 'text'),
                'like_count': traverse_obj(comment_dict, ('node', 'edge_liked_by', 'count'), 'comment_like_count', expected_type=int_or_none),
                'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), 'created_at', expected_type=int_or_none),
            }
225 | ||
8dcf65c9 | 226 | |
class InstagramIOSIE(InfoExtractor):
    """Redirects `instagram://media?id=...` URLs to the regular web extractor."""

    IE_DESC = 'IOS instagram:// URL'
    _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
    _TESTS = [{
        'url': 'instagram://media?id=482584233761418119',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y F O R A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'add_ie': ['Instagram']
    }]

    def _real_extract(self, url):
        # The numeric media id from the URL is turned into a web shortcode
        media_pk = self._match_id(url)
        shortcode = _pk_to_id(media_pk)
        web_url = f'http://instagram.com/tv/{shortcode}'
        return self.url_result(web_url, InstagramIE, shortcode)
fb2d1ee6 | 254 | |
255 | ||
class InstagramIE(InstagramBaseIE):
    """Extractor for single Instagram post, reel and tv URLs."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1']
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 8.747,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': '2815873',
            'uploader': 'B E A U T Y F O R A S H E S',
            'channel': 'naomipq',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'expected_warnings': [
            'General metadata extraction failed',
            'Main webpage is locked behind the login page',
        ],
    }, {
        # reel
        'url': 'https://www.instagram.com/reel/Chunk8-jurw/',
        'md5': 'f6d8277f74515fa3ff9f5791426e42b1',
        'info_dict': {
            'id': 'Chunk8-jurw',
            'ext': 'mp4',
            'title': 'Video by instagram',
            'description': 'md5:c9cde483606ed6f80fbe9283a6a2b290',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 5.016,
            'timestamp': 1661529231,
            'upload_date': '20220826',
            'uploader_id': '25025320',
            'uploader': 'Instagram',
            'channel': 'instagram',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'expected_warnings': [
            'General metadata extraction failed',
            'Main webpage is locked behind the login page',
        ],
    }, {
        # multi video post
        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
        'playlist': [{
            'info_dict': {
                'id': 'BQ0dSaohpPW',
                'ext': 'mp4',
                'title': 'Video 1',
                'thumbnail': r're:^https?://.*\.jpg',
                'view_count': int,
            },
        }, {
            'info_dict': {
                'id': 'BQ0dTpOhuHT',
                'ext': 'mp4',
                'title': 'Video 2',
                'thumbnail': r're:^https?://.*\.jpg',
                'view_count': int,
            },
        }, {
            'info_dict': {
                'id': 'BQ0dT7RBFeF',
                'ext': 'mp4',
                'title': 'Video 3',
                'thumbnail': r're:^https?://.*\.jpg',
                'view_count': int,
            },
        }],
        'info_dict': {
            'id': 'BQ0eAlwhDrw',
            'title': 'Post by instagram',
            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
        },
        'expected_warnings': [
            'General metadata extraction failed',
            'Main webpage is locked behind the login page',
        ],
    }, {
        # IGTV
        'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
        'info_dict': {
            'id': 'BkfuX9UB-eK',
            'ext': 'mp4',
            'title': 'Fingerboarding Tricks with @cass.fb',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 53.83,
            'timestamp': 1530032919,
            'upload_date': '20180626',
            'uploader_id': '25025320',
            'uploader': 'Instagram',
            'channel': 'instagram',
            'like_count': int,
            'comment_count': int,
            'comments': list,
            'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
        },
        'expected_warnings': [
            'General metadata extraction failed',
            'Main webpage is locked behind the login page',
        ],
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/tv/aye83DjauH/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/marvelskies.fc/reel/CWqAgUZgCku/',
        'only_matching': True,
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Return embed URLs found via _EMBED_REGEX, else the first link of an `instagram-media` element."""
        res = tuple(super()._extract_embed_urls(url, webpage))
        if res:
            return res

        mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1',
                         get_element_by_attribute('class', 'instagram-media', webpage) or '')
        if mobj:
            return [mobj.group('link')]

    def _real_extract(self, url):
        """Extract a single post, trying progressively weaker data sources.

        Order: media info API (only with a `sessionid` cookie), GraphQL
        query, main webpage shared data, and finally the embed webpage.
        """
        video_id, url = self._match_valid_url(url).group('id', 'url')
        media, webpage = {}, ''

        if self._get_cookies(url).get('sessionid'):
            info = traverse_obj(self._download_json(
                f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id,
                fatal=False, errnote='Video info extraction failed',
                note='Downloading video info', headers=self._API_HEADERS), ('items', 0))
            if info:
                media.update(info)
                return self._extract_product(media)

        api_check = self._download_json(
            f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}',
            video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {}
        csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken')

        if not csrf_token:
            self.report_warning('No csrf token set by Instagram API', video_id)
        else:
            # The cookie value is only used when the ruling request succeeded
            csrf_token = csrf_token.value if api_check.get('status') == 'ok' else None
            if not csrf_token:
                self.report_warning('Instagram API is not granting access', video_id)

        variables = {
            'shortcode': video_id,
            'child_comment_count': 3,
            'fetch_comment_count': 40,
            'parent_comment_count': 24,
            'has_threaded_comments': True,
        }
        general_info = self._download_json(
            'https://www.instagram.com/graphql/query/', video_id, fatal=False, errnote=False,
            headers={
                **self._API_HEADERS,
                'X-CSRFToken': csrf_token or '',
                'X-Requested-With': 'XMLHttpRequest',
                'Referer': url,
            }, query={
                'query_hash': '9f8827793ef34641b2fb195d4d41151c',
                'variables': json.dumps(variables, separators=(',', ':')),
            })
        media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {})

        if not general_info:
            self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id)
            webpage, urlh = self._download_webpage_handle(url, video_id)
            shared_data = self._search_json(
                r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {}

            if shared_data and self._LOGIN_URL not in urlh.geturl():
                media.update(traverse_obj(
                    shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
                    ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {})
            else:
                self.report_warning('Main webpage is locked behind the login page. Retrying with embed webpage (some metadata might be missing).')
                # A failed non-fatal download does not yield a string; fall
                # back to '' so the regex searches below cannot crash.
                webpage = self._download_webpage(
                    f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) or ''
                additional_data = self._search_json(
                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False)
                if not additional_data and not media:
                    self.raise_login_required('Requested content is not available, rate-limit reached or login required')

                product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict)
                if product_item:
                    media.update(product_item)
                    return self._extract_product(media)

                media.update(traverse_obj(
                    additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {})

        username = traverse_obj(media, ('owner', 'username')) or self._search_regex(
            r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False)

        description = (
            traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
            or media.get('caption'))
        if not description:
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                description = lowercase_escape(description)

        video_url = media.get('video_url')
        if not video_url:
            # No direct video: a sidecar post becomes a playlist of its children
            nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or []
            if nodes:
                return self.playlist_result(
                    self._extract_nodes(nodes, True), video_id,
                    format_field(username, None, 'Post by %s'), description)

            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': self._get_dimension('width', media, webpage),
            'height': self._get_dimension('height', media, webpage),
        }]
        dash = traverse_obj(media, ('dash_info', 'video_dash_manifest'))
        if dash:
            formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))

        comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))
        comments = [{
            'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
            'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
            'id': traverse_obj(comment_dict, ('node', 'id')),
            'text': traverse_obj(comment_dict, ('node', 'text')),
            'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
        } for comment_dict in comment_data] if comment_data else None

        # The middle fallback is filtered so it can be empty; unfiltered it
        # was always a truthy two-element list and the og:image fallback
        # could never be reached.
        display_resources = (
            media.get('display_resources')
            or [{'src': media[key]} for key in ('display_src', 'display_url') if media.get(key)]
            or [{'src': self._og_search_thumbnail(webpage, default=None)}])
        thumbnails = [{
            'url': thumbnail['src'],
            'width': thumbnail.get('config_width'),
            'height': thumbnail.get('config_height'),
        } for thumbnail in display_resources if thumbnail.get('src')]

        return {
            'id': video_id,
            'formats': formats,
            'title': media.get('title') or 'Video by %s' % username,
            'description': description,
            'duration': float_or_none(media.get('video_duration')),
            'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
            'uploader_id': traverse_obj(media, ('owner', 'id')),
            'uploader': traverse_obj(media, ('owner', 'full_name')),
            'channel': username,
            'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
                r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
            'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
            'comments': comments,
            'thumbnails': thumbnails,
            'http_headers': {
                'Referer': 'https://www.instagram.com/',
            }
        }
ea38e55f PH |
535 | |
536 | ||
class InstagramPlaylistBaseIE(InstagramBaseIE):
    """Base for paginated GraphQL playlists.

    Subclasses supply `_QUERY_HASH`, `_parse_timeline_from(data)` and
    `_query_vars_for(data)`.
    """

    _gis_tmpl = None  # used to cache GIS request type

    def _parse_graphql(self, webpage, item_id):
        """Return the `sharedData` JSON object embedded in the webpage."""
        shared_json = self._search_regex(
            r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data')
        return self._parse_json(shared_json, item_id)

    def _extract_graphql(self, data, url):
        """Yield playlist entries, paging through the GraphQL timeline."""
        uploader_id = self._match_id(url)
        csrf_token = data['config']['csrf_token']
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

        cursor = ''
        for page_num in itertools.count(1):
            variables = json.dumps({
                'first': 12,
                'after': cursor,
                **self._query_vars_for(data),
            })

            # A template that worked before is reused alone; otherwise every
            # known way of building the GIS signature is attempted in turn.
            gis_tmpls = [self._gis_tmpl] if self._gis_tmpl else [
                '%s' % rhx_gis,
                '',
                '%s:%s' % (rhx_gis, csrf_token),
                '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']),
            ]

            for gis_tmpl in gis_tmpls:
                signature = hashlib.md5(
                    ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest()
                try:
                    json_data = self._download_json(
                        'https://www.instagram.com/graphql/query/', uploader_id,
                        'Downloading JSON page %d' % page_num, headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            'X-Instagram-GIS': signature,
                        }, query={
                            'query_hash': self._QUERY_HASH,
                            'variables': variables,
                        })
                    media = self._parse_timeline_from(json_data)
                    # remember the working template for later requests
                    self._gis_tmpl = gis_tmpl
                    break
                except ExtractorError as e:
                    # Only a 403 with further templates left is retried;
                    # anything else is propagated.
                    is_forbidden = isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403
                    if not is_forbidden or gis_tmpl == gis_tmpls[-1]:
                        raise

            nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or []
            if not nodes:
                break
            yield from self._extract_nodes(nodes)

            cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str)
            if not (cursor and traverse_obj(media, ('page_info', 'has_next_page'))):
                break

    def _real_extract(self, url):
        item_id = self._match_id(url)
        data = self._parse_graphql(self._download_webpage(url, item_id), item_id)

        self._set_cookie('instagram.com', 'ig_pr', '1')

        entries = self._extract_graphql(data, url)
        return self.playlist_result(entries, item_id, item_id)
616 | ||
617 | ||
class InstagramUserIE(InstagramPlaylistBaseIE):
    """Playlist of the videos on an Instagram user profile."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TESTS = [{
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }]

    # A plain string: a trailing comma here previously made this a 1-tuple
    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['user']['edge_owner_to_timeline_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        }
650 | ||
651 | ||
class InstagramTagIE(InstagramPlaylistBaseIE):
    """Playlist of the videos listed under an Instagram hashtag."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
    IE_DESC = 'Instagram hashtag search URLs'
    IE_NAME = 'instagram:tag'
    _TESTS = [{
        'url': 'https://instagram.com/explore/tags/lolcats',
        'info_dict': {
            'id': 'lolcats',
            'title': 'lolcats',
        },
        'playlist_count': 50,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 50,
        }
    }]

    # A plain string: a trailing comma here previously made this a 1-tuple
    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['hashtag']['edge_hashtag_to_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'tag_name':
                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
        }
dd5e60b1 | 685 | |
686 | ||
class InstagramStoryIE(InstagramBaseIE):
    """Playlist of the video items in a user story or a story highlight."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/]+)/(?P<id>\d+)'
    IE_NAME = 'instagram:story'

    _TESTS = [{
        'url': 'https://www.instagram.com/stories/highlights/18090946048123978/',
        'info_dict': {
            'id': '18090946048123978',
            'title': 'Rare',
        },
        'playlist_mincount': 50
    }]

    def _real_extract(self, url):
        username, story_id = self._match_valid_url(url).groups()
        story_info = self._download_webpage(url, story_id)
        user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
        if not user_info:
            self.raise_login_required('This content is unreachable')
        user_id = user_info.get('id')

        # A URL whose user part is 'highlights' is requested by highlight id;
        # any other URL is requested by the id of the user found on the page.
        story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
        videos = traverse_obj(self._download_json(
            f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
            story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
        if not videos:
            self.raise_login_required('You need to log in to access this content')

        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name'))
        story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
        if not story_title:
            story_title = f'Story by {username}'

        # `or []`: neither reel key may be present, in which case the
        # playlist is simply empty instead of failing on iteration.
        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) or []
        info_data = []
        for highlight in highlights:
            highlight_data = self._extract_product(highlight)
            # items without formats are dropped
            if highlight_data.get('formats'):
                info_data.append({
                    **highlight_data,
                    'uploader': full_name,
                    'uploader_id': user_id,
                })
        return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)