jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/instagram.py
[bbc] Get all available formats (#1717)
[yt-dlp.git] / yt_dlp / extractor / instagram.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import itertools
5 import hashlib
6 import json
7 import re
8 import time
9
10 from .common import InfoExtractor
11 from ..compat import (
12 compat_str,
13 compat_HTTPError,
14 )
15 from ..utils import (
16 ExtractorError,
17 float_or_none,
18 get_element_by_attribute,
19 int_or_none,
20 lowercase_escape,
21 std_headers,
22 try_get,
23 url_or_none,
24 variadic,
25 urlencode_postdata,
26 )
27
28
class InstagramBaseIE(InfoExtractor):
    """Shared base class for the Instagram extractors; performs the optional login."""
    _NETRC_MACHINE = 'instagram'
    # Set on InstagramBaseIE itself after a successful login, so every
    # subclass and instance sees it and the login is not repeated
    _IS_LOGGED_IN = False

    def _login(self):
        """Log in with the configured credentials, if there are any.

        Returns without doing anything when no username is available or a
        login has already succeeded. Raises ExtractorError when the login
        page does not contain the required tokens or when the response is
        not marked as authenticated.
        """
        username, password = self._get_login_info()
        if username is None or self._IS_LOGGED_IN:
            return

        login_webpage = self._download_webpage(
            'https://www.instagram.com/accounts/login/', None,
            note='Downloading login webpage', errnote='Failed to download login webpage')

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                login_webpage, 'shared data', default='{}'),
            None)

        # The regex above falls back to '{}', so either token may be absent;
        # report that clearly instead of crashing with a bare KeyError
        csrf_token = try_get(shared_data, lambda x: x['config']['csrf_token'])
        rollout_hash = try_get(shared_data, lambda x: x['rollout_hash'])
        if csrf_token is None or rollout_hash is None:
            raise ExtractorError('Unable to login: could not extract login tokens from the login page')

        login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={
            'Accept': '*/*',
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': '0',
            'X-Requested-With': 'XMLHttpRequest',
            'X-CSRFToken': csrf_token,
            'X-Instagram-AJAX': rollout_hash,
            'Referer': 'https://www.instagram.com/',
        }, data=urlencode_postdata({
            # The password is sent in the "version 0" form: a timestamp
            # followed by the password itself
            'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
            'username': username,
            'queryParams': '{}',
            'optIntoOneTap': 'false',
            'stopDeletionNonce': '',
            'trustedDeviceRecords': '{}',
        }))

        if not login.get('authenticated'):
            if login.get('message'):
                raise ExtractorError(f'Unable to login: {login["message"]}')
            raise ExtractorError('Unable to login')
        InstagramBaseIE._IS_LOGGED_IN = True

    def _real_initialize(self):
        """Attempt the login when the extractor is initialized."""
        self._login()
74
75
class InstagramIOSIE(InfoExtractor):
    """Maps instagram://media?id=... URLs onto regular Instagram post URLs."""
    IE_DESC = 'IOS instagram:// URL'
    _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
    _TESTS = [{
        'url': 'instagram://media?id=482584233761418119',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y F O R A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'add_ie': ['Instagram']
    }]

    def _get_id(self, id):
        """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id"""
        alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
        # Only the part before the first underscore is the numeric media id
        remaining = int(id.split('_')[0])
        # Collect base-64 digits least significant first, then reverse them
        digits = []
        while remaining > 0:
            remaining, index = divmod(remaining, 64)
            digits.append(alphabet[index])
        return ''.join(reversed(digits))

    def _real_extract(self, url):
        """Hand the equivalent web URL over to the Instagram extractor."""
        shortcode = self._get_id(self._match_id(url))
        return {
            '_type': 'url_transparent',
            'url': f'http://instagram.com/tv/{shortcode}/',
            'ie_key': 'Instagram',
        }
117
118
class InstagramIE(InstagramBaseIE):
    """Extractor for single Instagram posts (/p/, /tv/ and /reel/ URLs)."""
    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y F O R A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
    }, {
        # missing description
        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
        'info_dict': {
            'id': 'BA-pQFBG8HZ',
            'ext': 'mp4',
            'title': 'Video by britneyspears',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1453760977,
            'upload_date': '20160125',
            'uploader_id': 'britneyspears',
            'uploader': 'Britney Spears',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # multi video post
        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
        'playlist': [{
            'info_dict': {
                'id': 'BQ0dSaohpPW',
                'ext': 'mp4',
                'title': 'Video 1',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dTpOhuHT',
                'ext': 'mp4',
                'title': 'Video 2',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dT7RBFeF',
                'ext': 'mp4',
                'title': 'Video 3',
            },
        }],
        'info_dict': {
            'id': 'BQ0eAlwhDrw',
            'title': 'Post by instagram',
            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
        },
    }, {
        # IGTV
        'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
        'info_dict': {
            'id': 'BkfuX9UB-eK',
            'ext': 'mp4',
            'title': 'Fingerboarding Tricks with @cass.fb',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 53.83,
            'timestamp': 1530032919,
            'upload_date': '20180626',
            'uploader_id': 'instagram',
            'uploader': 'Instagram',
            'like_count': int,
            'comment_count': int,
            'comments': list,
            'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
        }
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/tv/aye83DjauH/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_embed_url(webpage):
        """Return the URL of an Instagram post embedded in webpage, or None.

        An <iframe> pointing at an /embed URL is preferred; otherwise the
        first link inside an element with class "instagram-media" is used.
        """
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
            webpage)
        if mobj:
            return mobj.group('url')

        blockquote_el = get_element_by_attribute(
            'class', 'instagram-media', webpage)
        if blockquote_el is None:
            return

        mobj = re.search(
            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
        if mobj:
            return mobj.group('link')

    def _real_extract(self, url):
        """Extract a single post.

        Returns a playlist result for multi-video (sidecar) posts and a
        plain info dict otherwise. Metadata is read from the JSON embedded
        in the page when present, with regex/OpenGraph fallbacks for the
        video URL, uploader id, description and thumbnail.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        url = mobj.group('url')

        webpage, urlh = self._download_webpage_handle(url, video_id)
        if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'):
            self.raise_login_required('You need to log in to access this content')

        # Every optional field starts out as None so that the fallbacks and
        # the final info dict also work when no media JSON can be found
        (media, video_url, description, title, duration, thumbnails,
         timestamp, uploader, uploader_id, like_count, comment_count,
         comments, height, width) = [None] * 14

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                webpage, 'shared data', default='{}'),
            video_id, fatal=False)
        if shared_data:
            media = try_get(
                shared_data,
                (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
                 lambda x: x['entry_data']['PostPage'][0]['media']),
                dict)
        # _sharedData.entry_data.PostPage is empty when authenticated (see
        # https://github.com/ytdl-org/youtube-dl/pull/22880)
        if not media:
            additional_data = self._parse_json(
                self._search_regex(
                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
                    webpage, 'additional data', default='{}'),
                video_id, fatal=False)
            if additional_data:
                media = try_get(
                    additional_data, lambda x: x['graphql']['shortcode_media'],
                    dict)
        if media:
            video_url = media.get('video_url')
            height = int_or_none(self._html_search_meta(('og:video:height', 'video:height'), webpage)) or try_get(media, lambda x: x['dimensions']['height'])
            width = int_or_none(self._html_search_meta(('og:video:width', 'video:width'), webpage)) or try_get(media, lambda x: x['dimensions']['width'])
            description = try_get(
                media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
                compat_str) or media.get('caption')
            title = media.get('title')
            display_resources = media.get('display_resources')
            if not display_resources:
                display_resources = [{'src': media.get('display_src')}, {'src': media.get('display_url')}]
            duration = float_or_none(media.get('video_duration'))
            timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
            uploader = try_get(media, lambda x: x['owner']['full_name'])
            uploader_id = try_get(media, lambda x: x['owner']['username'])

            def get_count(keys, kind):
                # Return the first count found under any of the given
                # edge_media_* keys or under the plural "<kind>s" key
                for key in variadic(keys):
                    count = int_or_none(try_get(
                        media, (lambda x: x['edge_media_%s' % key]['count'],
                                lambda x: x['%ss' % kind]['count'])))
                    if count is not None:
                        return count

            like_count = get_count('preview_like', 'like')
            comment_count = get_count(
                ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')

            thumbnails = [{
                'url': thumbnail['src'],
                'width': thumbnail.get('config_width'),
                'height': thumbnail.get('config_height'),
            } for thumbnail in display_resources if thumbnail.get('src')]

            # The comment edges may be missing altogether; treat that as
            # "no comments" instead of iterating over None
            comment_edges = try_get(
                media, lambda x: x['edge_media_to_parent_comment']['edges'],
                list) or []
            comments = []
            for comment in comment_edges:
                comment_dict = try_get(comment, lambda x: x['node'], dict) or {}
                comment_text = comment_dict.get('text')
                if comment_text:
                    comments.append({
                        'author': try_get(comment_dict, lambda x: x['owner']['username']),
                        'author_id': try_get(comment_dict, lambda x: x['owner']['id']),
                        'id': comment_dict.get('id'),
                        'text': comment_text,
                        'timestamp': int_or_none(comment_dict.get('created_at')),
                    })
            if not video_url:
                # No top-level video: this may be a sidecar post whose
                # children carry the individual videos
                edges = try_get(
                    media, lambda x: x['edge_sidecar_to_children']['edges'],
                    list) or []
                if edges:
                    entries = []
                    for edge_num, edge in enumerate(edges, start=1):
                        node = try_get(edge, lambda x: x['node'], dict)
                        if not node:
                            continue
                        node_video_url = url_or_none(node.get('video_url'))
                        if not node_video_url:
                            continue
                        entries.append({
                            'id': node.get('shortcode') or node['id'],
                            'title': node.get('title') or 'Video %d' % edge_num,
                            'url': node_video_url,
                            'thumbnail': node.get('display_url'),
                            'duration': float_or_none(node.get('video_duration')),
                            'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
                            'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
                            'view_count': int_or_none(node.get('video_view_count')),
                        })
                    return self.playlist_result(
                        entries, video_id,
                        'Post by %s' % uploader_id if uploader_id else None,
                        description)

        if not video_url:
            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': width,
            'height': height,
        }]
        dash = try_get(media, lambda x: x['dash_info']['video_dash_manifest'])
        if dash:
            formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
        self._sort_formats(formats)

        if not uploader_id:
            uploader_id = self._search_regex(
                r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
                webpage, 'uploader id', fatal=False)

        if not description:
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                description = lowercase_escape(description)

        if not thumbnails:
            # 'thumbnails' has to be a list of dicts, so wrap the single
            # OpenGraph image URL instead of storing the bare string
            og_thumbnail = self._og_search_thumbnail(webpage)
            thumbnails = [{'url': og_thumbnail}] if og_thumbnail else None

        return {
            'id': video_id,
            'formats': formats,
            'ext': 'mp4',
            'title': title or 'Video by %s' % uploader_id,
            'description': description,
            'duration': duration,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader': uploader,
            'like_count': like_count,
            'comment_count': comment_count,
            'comments': comments,
            'http_headers': {
                'Referer': 'https://www.instagram.com/',
            }
        }
391
392
class InstagramPlaylistBaseIE(InstagramBaseIE):
    """Paging logic shared by the listing extractors that query the GraphQL endpoint.

    The code below relies on subclasses providing _QUERY_HASH,
    _parse_timeline_from() and _query_vars_for().
    """
    # GIS signature template that worked for an earlier request, if any
    _gis_tmpl = None

    def _parse_graphql(self, webpage, item_id):
        """Return the sharedData JSON object embedded in webpage."""
        shared_data_json = self._search_regex(
            r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data')
        return self._parse_json(shared_data_json, item_id)

    def _extract_graphql(self, data, url):
        """Yield one url_result entry per video found while paging the timeline."""

        def count_of(node, suffix):
            # Integer stored under edge_media_<suffix>.count, or None
            return int_or_none(try_get(
                node, lambda x: x['edge_media_' + suffix]['count']))

        uploader_id = self._match_id(url)
        csrf_token = data['config']['csrf_token']
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

        cursor = ''
        for page_num in itertools.count(1):
            variables = json.dumps({
                'first': 12,
                'after': cursor,
                **self._query_vars_for(data),
            })

            # Reuse the template that worked before; otherwise every known
            # way of building the signature is tried in turn
            candidate_tmpls = [self._gis_tmpl] if self._gis_tmpl else [
                '%s' % rhx_gis,
                '',
                '%s:%s' % (rhx_gis, csrf_token),
                '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
            ]
            last_tmpl = candidate_tmpls[-1]

            for gis_tmpl in candidate_tmpls:
                signature = hashlib.md5(
                    ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest()
                try:
                    json_data = self._download_json(
                        'https://www.instagram.com/graphql/query/', uploader_id,
                        'Downloading JSON page %d' % page_num, headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            'X-Instagram-GIS': signature,
                        }, query={
                            'query_hash': self._QUERY_HASH,
                            'variables': variables,
                        })
                    media = self._parse_timeline_from(json_data)
                    # remember the working template for later requests
                    self._gis_tmpl = gis_tmpl
                    break
                except ExtractorError as e:
                    # An HTTP 403 is tolerated while further templates are
                    # left to try; anything else is re-raised
                    forbidden = isinstance(e.cause, compat_HTTPError) and e.cause.code == 403
                    if not forbidden or gis_tmpl == last_tmpl:
                        raise

            edges = media.get('edges')
            if not (edges and isinstance(edges, list)):
                break

            for edge in edges:
                node = edge.get('node')
                if not (node and isinstance(node, dict)):
                    continue
                is_video = node.get('__typename') == 'GraphVideo' or node.get('is_video') is True
                if not is_video:
                    continue
                video_id = node.get('shortcode')
                if not video_id:
                    continue

                info = self.url_result(
                    'https://instagram.com/p/%s/' % video_id,
                    ie=InstagramIE.ie_key(), video_id=video_id)
                info.update({
                    'description': try_get(
                        node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
                        compat_str),
                    'thumbnail': node.get('thumbnail_src') or node.get('display_src'),
                    'timestamp': int_or_none(node.get('taken_at_timestamp')),
                    'comment_count': count_of(node, 'to_comment'),
                    'like_count': count_of(node, 'preview_like'),
                    'view_count': int_or_none(node.get('video_view_count')),
                })
                yield info

            # Stop unless the response announces another page and supplies
            # a usable cursor for it
            page_info = media.get('page_info')
            if not (page_info and isinstance(page_info, dict)):
                break
            if not page_info.get('has_next_page'):
                break
            cursor = page_info.get('end_cursor')
            if not (cursor and isinstance(cursor, compat_str)):
                break

    def _real_extract(self, url):
        """Download the listing page and return its videos as a playlist."""
        user_or_tag = self._match_id(url)
        webpage = self._download_webpage(url, user_or_tag)
        data = self._parse_graphql(webpage, user_or_tag)

        self._set_cookie('instagram.com', 'ig_pr', '1')

        entries = self._extract_graphql(data, url)
        return self.playlist_result(entries, user_or_tag, user_or_tag)
517
518
class InstagramUserIE(InstagramPlaylistBaseIE):
    """Playlist extractor for the videos on an Instagram user profile."""
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TESTS = [{
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }]

    # Plain string: the original had a stray trailing comma here, which
    # turned the value into a one-element tuple
    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['user']['edge_owner_to_timeline_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        }
551
552
class InstagramTagIE(InstagramPlaylistBaseIE):
    """Playlist extractor for the videos listed under an Instagram hashtag."""
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
    IE_DESC = 'Instagram hashtag search'
    IE_NAME = 'instagram:tag'
    _TESTS = [{
        'url': 'https://instagram.com/explore/tags/lolcats',
        'info_dict': {
            'id': 'lolcats',
            'title': 'lolcats',
        },
        'playlist_count': 50,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 50,
        }
    }]

    # Plain string: the original had a stray trailing comma here, which
    # turned the value into a one-element tuple
    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['hashtag']['edge_hashtag_to_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'tag_name':
                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
        }