]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/instagram.py
[RedGifs] Add Search and User extractors (#1808)
[yt-dlp.git] / yt_dlp / extractor / instagram.py
CommitLineData
8dcf65c9 1# coding: utf-8
0de668af 2
cba5d1b6 3import itertools
315ab3d5 4import hashlib
27b1c73f 5import json
59fc531f 6import re
ab2ffab2 7import time
59fc531f
JMF
8
9from .common import InfoExtractor
238d42cf 10from ..compat import (
238d42cf
S
11 compat_HTTPError,
12)
e1ec9330 13from ..utils import (
238d42cf 14 ExtractorError,
cce889b9 15 float_or_none,
c4096e8a 16 get_element_by_attribute,
e1ec9330 17 int_or_none,
87696e78 18 lowercase_escape,
238d42cf 19 std_headers,
eb56d132 20 traverse_obj,
3052a30d 21 url_or_none,
ab2ffab2 22 urlencode_postdata,
e1ec9330 23)
59fc531f 24
0de668af 25
class InstagramBaseIE(InfoExtractor):
    """Shared login and node-parsing helpers for the Instagram extractors."""

    _NETRC_MACHINE = 'instagram'
    # Class-level flag so that one successful login is reused by every
    # extractor instance derived from this base.
    _IS_LOGGED_IN = False

    def _login(self):
        """Log in with the configured credentials, once per process.

        Does nothing when no username is configured or a previous login
        already succeeded. Raises ExtractorError when the login endpoint
        does not report the session as authenticated.
        """
        username, password = self._get_login_info()
        if username is None or self._IS_LOGGED_IN:
            return

        login_webpage = self._download_webpage(
            'https://www.instagram.com/accounts/login/', None,
            note='Downloading login webpage', errnote='Failed to download login webpage')

        # The CSRF token and rollout hash needed for the AJAX login call are
        # read from the JSON blob assigned to window._sharedData.
        shared_data_json = self._search_regex(
            r'window\._sharedData\s*=\s*({.+?});',
            login_webpage, 'shared data', default='{}')
        shared_data = self._parse_json(shared_data_json, None)

        request_headers = {
            'Accept': '*/*',
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': '0',
            'X-Requested-With': 'XMLHttpRequest',
            'X-CSRFToken': shared_data['config']['csrf_token'],
            'X-Instagram-AJAX': shared_data['rollout_hash'],
            'Referer': 'https://www.instagram.com/',
        }
        form_data = urlencode_postdata({
            'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
            'username': username,
            'queryParams': '{}',
            'optIntoOneTap': 'false',
            'stopDeletionNonce': '',
            'trustedDeviceRecords': '{}',
        })

        response = self._download_json(
            'https://www.instagram.com/accounts/login/ajax/', None,
            note='Logging in', headers=request_headers, data=form_data)

        if response.get('authenticated'):
            InstagramBaseIE._IS_LOGGED_IN = True
            return

        reason = response.get('message')
        if reason:
            raise ExtractorError(f'Unable to login: {reason}')
        raise ExtractorError('Unable to login')

    def _real_initialize(self):
        self._login()

    def _get_count(self, media, kind, *keys):
        """Return the first available count for `kind`.

        Tries `media[kind]['count']` first and then
        `media['edge_media_<key>']['count']` for each of `keys`, in order.
        """
        candidate_paths = [(kind, 'count')]
        candidate_paths.extend((f'edge_media_{key}', 'count') for key in keys)
        return traverse_obj(media, *candidate_paths, expected_type=int_or_none)

    def _get_dimension(self, name, media, webpage=None):
        """Return the `name` dimension ('width'/'height') of a media item.

        Prefers `media['dimensions'][name]`; otherwise falls back to the
        `og:video:<name>` / `video:<name>` meta tags of `webpage`.
        """
        from_media = traverse_obj(media, ('dimensions', name), expected_type=int_or_none)
        if from_media:
            return from_media
        meta_value = self._html_search_meta(
            (f'og:video:{name}', f'video:{name}'), webpage or '', default=None)
        return int_or_none(meta_value)

    def _extract_nodes(self, nodes, is_direct=False):
        """Yield one info dict per video node in `nodes`.

        Non-video nodes are skipped. With `is_direct` the entry carries the
        node's own `video_url`; otherwise it is a `url` result that defers
        to the Instagram extractor via the node's shortcode (nodes without a
        shortcode are skipped in that mode).
        """
        for idx, node in enumerate(nodes, start=1):
            looks_like_video = (
                node.get('__typename') == 'GraphVideo'
                or node.get('is_video') is True)
            if not looks_like_video:
                continue

            video_id = node.get('shortcode')
            if not is_direct and not video_id:
                continue

            if is_direct:
                entry = {
                    'id': video_id or node['id'],
                    'url': node.get('video_url'),
                    'width': self._get_dimension('width', node),
                    'height': self._get_dimension('height', node),
                    'http_headers': {
                        'Referer': 'https://www.instagram.com/',
                    },
                }
            else:
                entry = {
                    '_type': 'url',
                    'ie_key': 'Instagram',
                    'id': video_id,
                    'url': f'https://instagram.com/p/{video_id}',
                }

            # Only direct entries get a synthetic positional title.
            fallback_title = f'Video {idx}' if is_direct else None
            entry.update({
                'title': node.get('title') or fallback_title,
                'description': traverse_obj(
                    node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str),
                'thumbnail': traverse_obj(
                    node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none),
                'duration': float_or_none(node.get('video_duration')),
                'timestamp': int_or_none(node.get('taken_at_timestamp')),
                'view_count': int_or_none(node.get('video_view_count')),
                'comment_count': self._get_count(
                    node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
                'like_count': self._get_count(node, 'likes', 'preview_like'),
            })
            yield entry
123
8dcf65c9 124
class InstagramIOSIE(InfoExtractor):
    """Resolve iOS `instagram://media?id=...` links to regular post URLs."""

    IE_DESC = 'IOS instagram:// URL'
    _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
    _TESTS = [{
        'url': 'instagram://media?id=482584233761418119',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y F O R A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'add_ie': ['Instagram']
    }]

    def _get_id(self, id):
        """Convert a numeric media id into its base-64 style shortcode.

        Only the part of `id` before the first underscore is used. Returns
        an empty string when that number is zero.

        Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id
        """
        alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
        remaining = int(id.split('_')[0])
        # Digits are produced least-significant first and reversed at the end.
        symbols = []
        while remaining > 0:
            remaining, position = divmod(remaining, 64)
            symbols.append(alphabet[position])
        return ''.join(reversed(symbols))

    def _real_extract(self, url):
        shortcode = self._get_id(self._match_id(url))
        return {
            '_type': 'url_transparent',
            'url': f'http://instagram.com/tv/{shortcode}/',
            'ie_key': 'Instagram',
        }
166
167
class InstagramIE(InstagramBaseIE):
    """Extractor for single Instagram posts (`/p/`, `/tv/` and `/reel/` URLs)."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y F O R A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
    }, {
        # missing description
        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
        'info_dict': {
            'id': 'BA-pQFBG8HZ',
            'ext': 'mp4',
            'title': 'Video by britneyspears',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1453760977,
            'upload_date': '20160125',
            'uploader_id': 'britneyspears',
            'uploader': 'Britney Spears',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # multi video post
        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
        'playlist': [{
            'info_dict': {
                'id': 'BQ0dSaohpPW',
                'ext': 'mp4',
                'title': 'Video 1',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dTpOhuHT',
                'ext': 'mp4',
                'title': 'Video 2',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dT7RBFeF',
                'ext': 'mp4',
                'title': 'Video 3',
            },
        }],
        'info_dict': {
            'id': 'BQ0eAlwhDrw',
            'title': 'Post by instagram',
            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
        },
    }, {
        # IGTV
        'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
        'info_dict': {
            'id': 'BkfuX9UB-eK',
            'ext': 'mp4',
            'title': 'Fingerboarding Tricks with @cass.fb',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 53.83,
            'timestamp': 1530032919,
            'upload_date': '20180626',
            'uploader_id': 'instagram',
            'uploader': 'Instagram',
            'like_count': int,
            'comment_count': int,
            'comments': list,
            'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
        }
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/tv/aye83DjauH/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_embed_url(webpage):
        """Return the URL of an Instagram post embedded in `webpage`, if any.

        Looks first for an `<iframe>` pointing at an `/p/<id>/embed` URL and
        then for the first link inside an element with class
        `instagram-media`. Returns None when neither is found.
        """
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
            webpage)
        if mobj:
            return mobj.group('url')

        blockquote_el = get_element_by_attribute(
            'class', 'instagram-media', webpage)
        if blockquote_el is None:
            return

        mobj = re.search(
            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
        if mobj:
            return mobj.group('link')

    def _real_extract(self, url):
        """Extract a single video, or a playlist for multi-video posts."""
        # The `url` group of _VALID_URL ends right after the shortcode, so the
        # request is made without any trailing path, query or fragment.
        video_id, url = self._match_valid_url(url).group('id', 'url')
        webpage, urlh = self._download_webpage_handle(url, video_id)
        if 'www.instagram.com/accounts/login' in urlh.geturl():
            self.raise_login_required('You need to log in to access this content')

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                webpage, 'shared data', default='{}'),
            video_id, fatal=False)
        media = traverse_obj(
            shared_data,
            ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
            ('entry_data', 'PostPage', 0, 'media'),
            expected_type=dict)

        # _sharedData.entry_data.PostPage is empty when authenticated (see
        # https://github.com/ytdl-org/youtube-dl/pull/22880)
        if not media:
            additional_data = self._parse_json(
                self._search_regex(
                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
                    webpage, 'additional data', default='{}'),
                video_id, fatal=False)
            media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), expected_type=dict) or {}

        uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex(
            r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False)

        description = (
            traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
            or media.get('caption'))
        if not description:
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                description = lowercase_escape(description)

        video_url = media.get('video_url')
        if not video_url:
            # Posts without their own video_url may be sidecars holding
            # several child nodes; those are returned as a playlist.
            nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or []
            if nodes:
                return self.playlist_result(
                    self._extract_nodes(nodes, True), video_id,
                    'Post by %s' % uploader_id if uploader_id else None, description)

            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': self._get_dimension('width', media, webpage),
            'height': self._get_dimension('height', media, webpage),
        }]
        dash = traverse_obj(media, ('dash_info', 'video_dash_manifest'))
        if dash:
            formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
        self._sort_formats(formats)

        # The lookup yields None when the post carries no comment edges
        # (e.g. when `media` is empty), so fall back to an empty list rather
        # than iterating None.
        comment_edges = traverse_obj(media, ('edge_media_to_parent_comment', 'edges')) or []
        comments = [{
            'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
            'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
            'id': traverse_obj(comment_dict, ('node', 'id')),
            'text': traverse_obj(comment_dict, ('node', 'text')),
            'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
        } for comment_dict in comment_edges]

        # Thumbnail sources in order of preference. Each fallback is tried
        # only when the previous step produced no usable entry; empty `src`
        # values are dropped up front so the og:image fallback stays
        # reachable.
        display_resources = media.get('display_resources')
        if not display_resources:
            display_resources = [
                {'src': media[key]}
                for key in ('display_src', 'display_url') if media.get(key)]
        if not display_resources:
            display_resources = [{'src': self._og_search_thumbnail(webpage, default=None)}]
        thumbnails = [{
            'url': thumbnail['src'],
            'width': thumbnail.get('config_width'),
            'height': thumbnail.get('config_height'),
        } for thumbnail in display_resources if thumbnail.get('src')]

        return {
            'id': video_id,
            'formats': formats,
            'title': media.get('title') or 'Video by %s' % uploader_id,
            'description': description,
            'duration': float_or_none(media.get('video_duration')),
            'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
            'uploader_id': uploader_id,
            'uploader': traverse_obj(media, ('owner', 'full_name')),
            'like_count': self._get_count(media, 'likes', 'preview_like'),
            'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
            'comments': comments,
            'thumbnails': thumbnails,
            'http_headers': {
                'Referer': 'https://www.instagram.com/',
            }
        }
ea38e55f
PH
379
380
class InstagramPlaylistBaseIE(InstagramBaseIE):
    """Base for paginated GraphQL listings.

    Subclasses are expected to provide `_QUERY_HASH`, `_parse_timeline_from`
    and `_query_vars_for`, all of which are used by `_extract_graphql`.
    """

    _gis_tmpl = None  # used to cache GIS request type

    def _parse_graphql(self, webpage, item_id):
        """Read a webpage and return the JSON assigned to its `sharedData`."""
        raw_json = self._search_regex(
            r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data')
        return self._parse_json(raw_json, item_id)

    def _extract_graphql(self, data, url):
        """Page through the GraphQL listing and yield its video entries."""
        uploader_id = self._match_id(url)
        csrf_token = data['config']['csrf_token']
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

        cursor = ''
        for page_num in itertools.count(1):
            variables = json.dumps({
                'first': 12,
                'after': cursor,
                **self._query_vars_for(data),
            })

            # Reuse the template that worked before, if any; otherwise try
            # every known way of building the GIS signature input.
            if self._gis_tmpl:
                gis_tmpls = [self._gis_tmpl]
            else:
                gis_tmpls = [
                    '%s' % rhx_gis,
                    '',
                    '%s:%s' % (rhx_gis, csrf_token),
                    '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
                ]

            for gis_tmpl in gis_tmpls:
                # The X-Instagram-GIS header is the MD5 of
                # "<template>:<variables>".
                signature = hashlib.md5(
                    ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest()
                try:
                    json_data = self._download_json(
                        'https://www.instagram.com/graphql/query/', uploader_id,
                        'Downloading JSON page %d' % page_num, headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            'X-Instagram-GIS': signature,
                        }, query={
                            'query_hash': self._QUERY_HASH,
                            'variables': variables,
                        })
                    media = self._parse_timeline_from(json_data)
                    # Remember the first template that works for later pages.
                    self._gis_tmpl = gis_tmpl
                    break
                except ExtractorError as e:
                    # A 403 signals a rejected signature; swallow it only
                    # while there are more templates left to try.
                    rejected_signature = (
                        isinstance(e.cause, compat_HTTPError) and e.cause.code == 403)
                    may_retry = rejected_signature and gis_tmpl != gis_tmpls[-1]
                    if not may_retry:
                        raise

            nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or []
            if not nodes:
                return
            yield from self._extract_nodes(nodes)

            has_next_page = traverse_obj(media, ('page_info', 'has_next_page'))
            cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str)
            if not (has_next_page and cursor):
                return

    def _real_extract(self, url):
        user_or_tag = self._match_id(url)
        webpage = self._download_webpage(url, user_or_tag)
        data = self._parse_graphql(webpage, user_or_tag)

        self._set_cookie('instagram.com', 'ig_pr', '1')

        entries = self._extract_graphql(data, url)
        return self.playlist_result(entries, user_or_tag, user_or_tag)
460
461
class InstagramUserIE(InstagramPlaylistBaseIE):
    """Playlist extractor for the videos on an Instagram user profile."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TESTS = [{
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }]

    # Sent as the `query_hash` request parameter. Must be a plain string: a
    # trailing comma here would silently turn it into a 1-tuple.
    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'

    @staticmethod
    def _parse_timeline_from(data):
        """Return the media timeline object from a GraphQL result."""
        return data['data']['user']['edge_owner_to_timeline_media']

    @staticmethod
    def _query_vars_for(data):
        """Return the extra timeline-query variables for this profile.

        The user id is read from the GraphQL data of the original page.
        """
        return {
            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        }
494
495
class InstagramTagIE(InstagramPlaylistBaseIE):
    """Playlist extractor for the videos listed under an Instagram hashtag."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
    IE_DESC = 'Instagram hashtag search'
    IE_NAME = 'instagram:tag'
    _TESTS = [{
        'url': 'https://instagram.com/explore/tags/lolcats',
        'info_dict': {
            'id': 'lolcats',
            'title': 'lolcats',
        },
        'playlist_count': 50,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 50,
        }
    }]

    # Sent as the `query_hash` request parameter. Must be a plain string: a
    # trailing comma here would silently turn it into a 1-tuple.
    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'

    @staticmethod
    def _parse_timeline_from(data):
        """Return the media timeline object from a GraphQL result."""
        return data['data']['hashtag']['edge_hashtag_to_media']

    @staticmethod
    def _query_vars_for(data):
        """Return the extra timeline-query variables for this hashtag.

        The tag name is read from the GraphQL data of the original page.
        """
        return {
            'tag_name':
                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
        }