2 from __future__
import unicode_literals
10 from .common
import InfoExtractor
11 from ..compat
import (
18 get_element_by_attribute
,
29 class InstagramBaseIE(InfoExtractor
):
30 _NETRC_MACHINE
= 'instagram'
34 username
, password
= self
._get
_login
_info
()
35 if username
is None or self
._IS
_LOGGED
_IN
:
38 login_webpage
= self
._download
_webpage
(
39 'https://www.instagram.com/accounts/login/', None,
40 note
='Downloading login webpage', errnote
='Failed to download login webpage')
42 shared_data
= self
._parse
_json
(
44 r
'window\._sharedData\s*=\s*({.+?});',
45 login_webpage
, 'shared data', default
='{}'),
48 login
= self
._download
_json
('https://www.instagram.com/accounts/login/ajax/', None, note
='Logging in', headers
={
50 'X-IG-App-ID': '936619743392459',
51 'X-ASBD-ID': '198387',
52 'X-IG-WWW-Claim': '0',
53 'X-Requested-With': 'XMLHttpRequest',
54 'X-CSRFToken': shared_data
['config']['csrf_token'],
55 'X-Instagram-AJAX': shared_data
['rollout_hash'],
56 'Referer': 'https://www.instagram.com/',
57 }, data
=urlencode_postdata({
58 'enc_password': f
'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
61 'optIntoOneTap': 'false',
62 'stopDeletionNonce': '',
63 'trustedDeviceRecords': '{}',
66 if not login
.get('authenticated'):
67 if login
.get('message'):
68 raise ExtractorError(f
'Unable to login: {login["message"]}')
69 raise ExtractorError('Unable to login')
70 InstagramBaseIE
._IS
_LOGGED
_IN
= True
72 def _real_initialize(self
):
76 class InstagramIOSIE(InfoExtractor
):
77 IE_DESC
= 'IOS instagram:// URL'
78 _VALID_URL
= r
'instagram://media\?id=(?P<id>[\d_]+)'
80 'url': 'instagram://media?id=482584233761418119',
81 'md5': '0d2da106a9d2631273e192b372806516',
85 'title': 'Video by naomipq',
86 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
87 'thumbnail': r
're:^https?://.*\.jpg',
89 'timestamp': 1371748545,
90 'upload_date': '20130620',
91 'uploader_id': 'naomipq',
92 'uploader': 'B E A U T Y F O R A S H E S',
97 'add_ie': ['Instagram']
100 def _get_id(self
, id):
101 """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id"""
102 chrs
= 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
103 media_id
= int(id.split('_')[0])
107 media_id
= (media_id
- r
) // 64
108 shortened_id
= chrs
[r
] + shortened_id
111 def _real_extract(self
, url
):
113 '_type': 'url_transparent',
114 'url': f
'http://instagram.com/tv/{self._get_id(self._match_id(url))}/',
115 'ie_key': 'Instagram',
119 class InstagramIE(InstagramBaseIE
):
120 _VALID_URL
= r
'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
122 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
123 'md5': '0d2da106a9d2631273e192b372806516',
127 'title': 'Video by naomipq',
128 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
129 'thumbnail': r
're:^https?://.*\.jpg',
131 'timestamp': 1371748545,
132 'upload_date': '20130620',
133 'uploader_id': 'naomipq',
134 'uploader': 'B E A U T Y F O R A S H E S',
136 'comment_count': int,
140 # missing description
141 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
145 'title': 'Video by britneyspears',
146 'thumbnail': r
're:^https?://.*\.jpg',
148 'timestamp': 1453760977,
149 'upload_date': '20160125',
150 'uploader_id': 'britneyspears',
151 'uploader': 'Britney Spears',
153 'comment_count': int,
157 'skip_download': True,
161 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
183 'title': 'Post by instagram',
184 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
188 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
192 'title': 'Fingerboarding Tricks with @cass.fb',
193 'thumbnail': r
're:^https?://.*\.jpg',
195 'timestamp': 1530032919,
196 'upload_date': '20180626',
197 'uploader_id': 'instagram',
198 'uploader': 'Instagram',
200 'comment_count': int,
202 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
205 'url': 'https://instagram.com/p/-Cmh1cukG2/',
206 'only_matching': True,
208 'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
209 'only_matching': True,
211 'url': 'https://www.instagram.com/tv/aye83DjauH/',
212 'only_matching': True,
214 'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
215 'only_matching': True,
219 def _extract_embed_url(webpage
):
221 r
'<iframe[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?instagram\
.com
/p
/[^
/]+/embed
.*?
)\
1',
224 return mobj.group('url
')
226 blockquote_el = get_element_by_attribute(
227 'class', 'instagram
-media
', webpage)
228 if blockquote_el is None:
232 r'<a
[^
>]+href
=([\'"])(?P<link>[^\'"]+)\
1', blockquote_el)
234 return mobj.group('link
')
236 def _real_extract(self, url):
237 mobj = self._match_valid_url(url)
238 video_id = mobj.group('id')
239 url = mobj.group('url
')
241 webpage, urlh = self._download_webpage_handle(url, video_id)
242 if 'www
.instagram
.com
/accounts
/login
' in urlh.geturl().rstrip('/'):
243 self.raise_login_required('You need to log
in to access this content
')
245 (media, video_url, description, thumbnails, timestamp, uploader,
246 uploader_id, like_count, comment_count, comments, height,
249 shared_data = self._parse_json(
251 r'window\
._sharedData\s
*=\s
*({.+?}
);',
252 webpage, 'shared data
', default='{}'),
253 video_id, fatal=False)
257 (lambda x: x['entry_data
']['PostPage
'][0]['graphql
']['shortcode_media
'],
258 lambda x: x['entry_data
']['PostPage
'][0]['media
']),
260 # _sharedData.entry_data.PostPage is empty when authenticated (see
261 # https://github.com/ytdl-org/youtube-dl/pull/22880)
263 additional_data = self._parse_json(
265 r'window\
.__additionalDataLoaded\s
*\
(\s
*[^
,]+,\s
*({.+?}
)\s
*\
)\s
*;',
266 webpage, 'additional data
', default='{}'),
267 video_id, fatal=False)
270 additional_data, lambda x: x['graphql
']['shortcode_media
'],
273 video_url = media.get('video_url
')
274 height = int_or_none(self._html_search_meta(('og
:video
:height
', 'video
:height
'), webpage)) or try_get(media, lambda x: x['dimensions
']['height
'])
275 width = int_or_none(self._html_search_meta(('og
:video
:width
', 'video
:width
'), webpage)) or try_get(media, lambda x: x['dimensions
']['width
'])
276 description = try_get(
277 media, lambda x: x['edge_media_to_caption
']['edges
'][0]['node
']['text
'],
278 compat_str) or media.get('caption
')
279 title = media.get('title
')
280 display_resources = media.get('display_resources
')
281 if not display_resources:
282 display_resources = [{'src': media.get('display_src')}, {'src': media.get('display_url')}]
283 duration = float_or_none(media.get('video_duration
'))
284 timestamp = int_or_none(media.get('taken_at_timestamp
') or media.get('date
'))
285 uploader = try_get(media, lambda x: x['owner
']['full_name
'])
286 uploader_id = try_get(media, lambda x: x['owner
']['username
'])
288 def get_count(keys, kind):
289 for key in variadic(keys):
290 count = int_or_none(try_get(
291 media, (lambda x: x['edge_media_
%s' % key]['count
'],
292 lambda x: x['%ss' % kind]['count
'])))
293 if count is not None:
296 like_count = get_count('preview_like
', 'like
')
297 comment_count = get_count(
298 ('preview_comment
', 'to_comment
', 'to_parent_comment
'), 'comment
')
301 'url
': thumbnail['src
'],
302 'width
': thumbnail.get('config_width
'),
303 'height
': thumbnail.get('config_height
'),
304 } for thumbnail in display_resources if thumbnail.get('src
')]
307 for comment in try_get(media, lambda x: x['edge_media_to_parent_comment
']['edges
']):
308 comment_dict = comment.get('node
', {})
309 comment_text = comment_dict.get('text
')
312 'author
': try_get(comment_dict, lambda x: x['owner
']['username
']),
313 'author_id
': try_get(comment_dict, lambda x: x['owner
']['id']),
314 'id': comment_dict.get('id'),
315 'text
': comment_text,
316 'timestamp
': int_or_none(comment_dict.get('created_at
')),
320 media, lambda x: x['edge_sidecar_to_children
']['edges
'],
324 for edge_num, edge in enumerate(edges, start=1):
325 node = try_get(edge, lambda x: x['node
'], dict)
328 node_video_url = url_or_none(node.get('video_url
'))
329 if not node_video_url:
332 'id': node.get('shortcode
') or node['id'],
333 'title
': node.get('title
') or 'Video
%d' % edge_num,
334 'url
': node_video_url,
335 'thumbnail
': node.get('display_url
'),
336 'duration
': float_or_none(node.get('video_duration
')),
337 'width
': int_or_none(try_get(node, lambda x: x['dimensions
']['width
'])),
338 'height
': int_or_none(try_get(node, lambda x: x['dimensions
']['height
'])),
339 'view_count
': int_or_none(node.get('video_view_count
')),
341 return self.playlist_result(
343 'Post by
%s' % uploader_id if uploader_id else None,
347 video_url = self._og_search_video_url(webpage, secure=False)
354 dash = try_get(media, lambda x: x['dash_info
']['video_dash_manifest
'])
356 formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash
'))
357 self._sort_formats(formats)
360 uploader_id = self._search_regex(
361 r'"owner"\s
*:\s
*{\s
*"username"\s
*:\s
*"(.+?)"',
362 webpage, 'uploader
id', fatal=False)
365 description = self._search_regex(
366 r'"caption"\s
*:\s
*"(.+?)"', webpage, 'description
', default=None)
367 if description is not None:
368 description = lowercase_escape(description)
371 thumbnails = self._og_search_thumbnail(webpage)
377 'title
': title or 'Video by
%s' % uploader_id,
378 'description
': description,
379 'duration
': duration,
380 'thumbnails
': thumbnails,
381 'timestamp
': timestamp,
382 'uploader_id
': uploader_id,
383 'uploader
': uploader,
384 'like_count
': like_count,
385 'comment_count
': comment_count,
386 'comments
': comments,
388 'Referer
': 'https
://www
.instagram
.com
/',
393 class InstagramPlaylistBaseIE(InstagramBaseIE):
394 _gis_tmpl = None # used to cache GIS request type
396 def _parse_graphql(self, webpage, item_id):
397 # Reads a webpage and returns its GraphQL data.
398 return self._parse_json(
400 r'sharedData\s
*=\s
*({.+?}
)\s
*;\s
*[<\n]', webpage, 'data
'),
403 def _extract_graphql(self, data, url):
404 # Parses GraphQL queries containing videos and generates a playlist.
405 def get_count(suffix):
406 return int_or_none(try_get(
407 node, lambda x: x['edge_media_
' + suffix]['count
']))
409 uploader_id = self._match_id(url)
410 csrf_token = data['config
']['csrf_token
']
411 rhx_gis = data.get('rhx_gis
') or '3c7ca9dcefcf966d11dacf1f151335e8
'
414 for page_num in itertools.count(1):
419 variables.update(self._query_vars_for(data))
420 variables = json.dumps(variables)
423 gis_tmpls = [self._gis_tmpl]
428 '%s:%s' % (rhx_gis, csrf_token),
429 '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User
-Agent
']),
432 # try all of the ways to generate a GIS query, and not only use the
433 # first one that works, but cache it for future requests
434 for gis_tmpl in gis_tmpls:
436 json_data = self._download_json(
437 'https
://www
.instagram
.com
/graphql
/query
/', uploader_id,
438 'Downloading JSON page
%d' % page_num, headers={
439 'X
-Requested
-With
': 'XMLHttpRequest
',
440 'X
-Instagram
-GIS
': hashlib.md5(
441 ('%s:%s' % (gis_tmpl, variables)).encode('utf
-8')).hexdigest(),
443 'query_hash
': self._QUERY_HASH,
444 'variables
': variables,
446 media = self._parse_timeline_from(json_data)
447 self._gis_tmpl = gis_tmpl
449 except ExtractorError as e:
450 # if it's an error caused by a bad query
, and there are
451 # more GIS templates to try, ignore it and keep trying
452 if isinstance(e
.cause
, compat_HTTPError
) and e
.cause
.code
== 403:
453 if gis_tmpl
!= gis_tmpls
[-1]:
457 edges
= media
.get('edges')
458 if not edges
or not isinstance(edges
, list):
462 node
= edge
.get('node')
463 if not node
or not isinstance(node
, dict):
465 if node
.get('__typename') != 'GraphVideo' and node
.get('is_video') is not True:
467 video_id
= node
.get('shortcode')
471 info
= self
.url_result(
472 'https://instagram.com/p/%s/' % video_id
,
473 ie
=InstagramIE
.ie_key(), video_id
=video_id
)
475 description
= try_get(
476 node
, lambda x
: x
['edge_media_to_caption']['edges'][0]['node']['text'],
478 thumbnail
= node
.get('thumbnail_src') or node
.get('display_src')
479 timestamp
= int_or_none(node
.get('taken_at_timestamp'))
481 comment_count
= get_count('to_comment')
482 like_count
= get_count('preview_like')
483 view_count
= int_or_none(node
.get('video_view_count'))
486 'description': description
,
487 'thumbnail': thumbnail
,
488 'timestamp': timestamp
,
489 'comment_count': comment_count
,
490 'like_count': like_count
,
491 'view_count': view_count
,
496 page_info
= media
.get('page_info')
497 if not page_info
or not isinstance(page_info
, dict):
500 has_next_page
= page_info
.get('has_next_page')
501 if not has_next_page
:
504 cursor
= page_info
.get('end_cursor')
505 if not cursor
or not isinstance(cursor
, compat_str
):
def _real_extract(self, url):
    """Return a playlist result for the user or tag page at *url*.

    The identifier matched from *url* is used as the id for the page
    download and for the GraphQL parsing step, and is passed twice to
    ``playlist_result`` (presumably as playlist id and playlist title --
    confirm against the base class).
    """
    user_or_tag = self._match_id(url)
    webpage = self._download_webpage(url, user_or_tag)
    data = self._parse_graphql(webpage, user_or_tag)

    # The cookie is set before the GraphQL entries are produced.
    # NOTE(review): presumably the query endpoint expects 'ig_pr' -- confirm.
    self._set_cookie('instagram.com', 'ig_pr', '1')

    return self.playlist_result(
        self._extract_graphql(data, url), user_or_tag, user_or_tag)
519 class InstagramUserIE(InstagramPlaylistBaseIE
):
520 _VALID_URL
= r
'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
521 IE_DESC
= 'Instagram user profile'
522 IE_NAME
= 'instagram:user'
524 'url': 'https://instagram.com/porsche',
531 'extract_flat': True,
532 'skip_download': True,
537 _QUERY_HASH
= '42323d64886122307be10013ad2dcc44',
540 def _parse_timeline_from(data
):
541 # extracts the media timeline data from a GraphQL result
542 return data
['data']['user']['edge_owner_to_timeline_media']
545 def _query_vars_for(data
):
546 # returns a dictionary of variables to add to the timeline query based
547 # on the GraphQL of the original page
549 'id': data
['entry_data']['ProfilePage'][0]['graphql']['user']['id']
553 class InstagramTagIE(InstagramPlaylistBaseIE
):
554 _VALID_URL
= r
'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
555 IE_DESC
= 'Instagram hashtag search'
556 IE_NAME
= 'instagram:tag'
558 'url': 'https://instagram.com/explore/tags/lolcats',
563 'playlist_count': 50,
565 'extract_flat': True,
566 'skip_download': True,
571 _QUERY_HASH
= 'f92f56d47dc7a55b606908374b43a314',
574 def _parse_timeline_from(data
):
575 # extracts the media timeline data from a GraphQL result
576 return data
['data']['hashtag']['edge_hashtag_to_media']
579 def _query_vars_for(data
):
580 # returns a dictionary of variables to add to the timeline query based
581 # on the GraphQL of the original page
584 data
['entry_data']['TagPage'][0]['graphql']['hashtag']['name']