yt_dlp/extractor/instagram.py

   1 # coding: utf-8
   2
   3 import itertools
   4 import hashlib
   5 import json
   6 import re
   7 import time
   8
   9 from .common import InfoExtractor
  10 from ..compat import (
  11     compat_HTTPError,
  12 )
  13 from ..utils import (
  14     ExtractorError,
  15     float_or_none,
  16     get_element_by_attribute,
  17     int_or_none,
  18     lowercase_escape,
  19     std_headers,
  20     str_to_int,
  21     traverse_obj,
  22     url_or_none,
  23     urlencode_postdata,
  24 )
  25
  26
  27 class InstagramBaseIE(InfoExtractor):
  28     _NETRC_MACHINE = 'instagram'
  29     _IS_LOGGED_IN = False
  30
  31     def _login(self):
  32         username, password = self._get_login_info()
  33         if username is None or self._IS_LOGGED_IN:
  34             return
  35
  36         login_webpage = self._download_webpage(
  37             'https://www.instagram.com/accounts/login/', None,
  38             note='Downloading login webpage', errnote='Failed to download login webpage')
  39
  40         shared_data = self._parse_json(
  41             self._search_regex(
  42                 r'window\._sharedData\s*=\s*({.+?});',
  43                 login_webpage, 'shared data', default='{}'),
  44             None)
  45
  46         login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={
  47             'Accept': '*/*',
  48             'X-IG-App-ID': '936619743392459',
  49             'X-ASBD-ID': '198387',
  50             'X-IG-WWW-Claim': '0',
  51             'X-Requested-With': 'XMLHttpRequest',
  52             'X-CSRFToken': shared_data['config']['csrf_token'],
  53             'X-Instagram-AJAX': shared_data['rollout_hash'],
  54             'Referer': 'https://www.instagram.com/',
  55         }, data=urlencode_postdata({
  56             'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
  57             'username': username,
  58             'queryParams': '{}',
  59             'optIntoOneTap': 'false',
  60             'stopDeletionNonce': '',
  61             'trustedDeviceRecords': '{}',
  62         }))
  63
  64         if not login.get('authenticated'):
  65             if login.get('message'):
  66                 raise ExtractorError(f'Unable to login: {login["message"]}')
  67             elif login.get('user'):
  68                 raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True)
  69             elif login.get('user') is False:
  70                 raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True)
  71             raise ExtractorError('Unable to login')
  72         InstagramBaseIE._IS_LOGGED_IN = True
  73
  74     def _real_initialize(self):
  75         self._login()
  76
  77     def _get_count(self, media, kind, *keys):
  78         return traverse_obj(
  79             media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys),
  80             expected_type=int_or_none)
  81
  82     def _get_dimension(self, name, media, webpage=None):
  83         return (
  84             traverse_obj(media, ('dimensions', name), expected_type=int_or_none)
  85             or int_or_none(self._html_search_meta(
  86                 (f'og:video:{name}', f'video:{name}'), webpage or '', default=None)))
  87
  88     def _extract_nodes(self, nodes, is_direct=False):
  89         for idx, node in enumerate(nodes, start=1):
  90             if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
  91                 continue
  92
  93             video_id = node.get('shortcode')
  94
  95             if is_direct:
  96                 info = {
  97                     'id': video_id or node['id'],
  98                     'url': node.get('video_url'),
  99                     'width': self._get_dimension('width', node),
 100                     'height': self._get_dimension('height', node),
 101                     'http_headers': {
 102                         'Referer': 'https://www.instagram.com/',
 103                     }
 104                 }
 105             elif not video_id:
 106                 continue
 107             else:
 108                 info = {
 109                     '_type': 'url',
 110                     'ie_key': 'Instagram',
 111                     'id': video_id,
 112                     'url': f'https://instagram.com/p/{video_id}',
 113                 }
 114
 115             yield {
 116                 **info,
 117                 'title': node.get('title') or (f'Video {idx}' if is_direct else None),
 118                 'description': traverse_obj(
 119                     node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str),
 120                 'thumbnail': traverse_obj(
 121                     node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none),
 122                 'duration': float_or_none(node.get('video_duration')),
 123                 'timestamp': int_or_none(node.get('taken_at_timestamp')),
 124                 'view_count': int_or_none(node.get('video_view_count')),
 125                 'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
 126                 'like_count': self._get_count(node, 'likes', 'preview_like'),
 127             }
 128
 129
 130 class InstagramIOSIE(InfoExtractor):
 131     IE_DESC = 'IOS instagram:// URL'
 132     _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
 133     _TESTS = [{
 134         'url': 'instagram://media?id=482584233761418119',
 135         'md5': '0d2da106a9d2631273e192b372806516',
 136         'info_dict': {
 137             'id': 'aye83DjauH',
 138             'ext': 'mp4',
 139             'title': 'Video by naomipq',
 140             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
 141             'thumbnail': r're:^https?://.*\.jpg',
 142             'duration': 0,
 143             'timestamp': 1371748545,
 144             'upload_date': '20130620',
 145             'uploader_id': 'naomipq',
 146             'uploader': 'B E A U T Y  F O R  A S H E S',
 147             'like_count': int,
 148             'comment_count': int,
 149             'comments': list,
 150         },
 151         'add_ie': ['Instagram']
 152     }]
 153
 154     def _get_id(self, id):
 155         """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id"""
 156         chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
 157         media_id = int(id.split('_')[0])
 158         shortened_id = ''
 159         while media_id > 0:
 160             r = media_id % 64
 161             media_id = (media_id - r) // 64
 162             shortened_id = chrs[r] + shortened_id
 163         return shortened_id
 164
 165     def _real_extract(self, url):
 166         return {
 167             '_type': 'url_transparent',
 168             'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/',
 169             'ie_key': 'Instagram',
 170         }
 171
 172
 173 class InstagramIE(InstagramBaseIE):
 174     _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
 175     _TESTS = [{
 176         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
 177         'md5': '0d2da106a9d2631273e192b372806516',
 178         'info_dict': {
 179             'id': 'aye83DjauH',
 180             'ext': 'mp4',
 181             'title': 'Video by naomipq',
 182             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
 183             'thumbnail': r're:^https?://.*\.jpg',
 184             'duration': 0,
 185             'timestamp': 1371748545,
 186             'upload_date': '20130620',
 187             'uploader_id': 'naomipq',
 188             'uploader': 'B E A U T Y  F O R  A S H E S',
 189             'like_count': int,
 190             'comment_count': int,
 191             'comments': list,
 192         },
 193     }, {
 194         # missing description
 195         'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
 196         'info_dict': {
 197             'id': 'BA-pQFBG8HZ',
 198             'ext': 'mp4',
 199             'title': 'Video by britneyspears',
 200             'thumbnail': r're:^https?://.*\.jpg',
 201             'duration': 0,
 202             'timestamp': 1453760977,
 203             'upload_date': '20160125',
 204             'uploader_id': 'britneyspears',
 205             'uploader': 'Britney Spears',
 206             'like_count': int,
 207             'comment_count': int,
 208             'comments': list,
 209         },
 210         'params': {
 211             'skip_download': True,
 212         },
 213     }, {
 214         # multi video post
 215         'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
 216         'playlist': [{
 217             'info_dict': {
 218                 'id': 'BQ0dSaohpPW',
 219                 'ext': 'mp4',
 220                 'title': 'Video 1',
 221             },
 222         }, {
 223             'info_dict': {
 224                 'id': 'BQ0dTpOhuHT',
 225                 'ext': 'mp4',
 226                 'title': 'Video 2',
 227             },
 228         }, {
 229             'info_dict': {
 230                 'id': 'BQ0dT7RBFeF',
 231                 'ext': 'mp4',
 232                 'title': 'Video 3',
 233             },
 234         }],
 235         'info_dict': {
 236             'id': 'BQ0eAlwhDrw',
 237             'title': 'Post by instagram',
 238             'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
 239         },
 240     }, {
 241         # IGTV
 242         'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
 243         'info_dict': {
 244             'id': 'BkfuX9UB-eK',
 245             'ext': 'mp4',
 246             'title': 'Fingerboarding Tricks with @cass.fb',
 247             'thumbnail': r're:^https?://.*\.jpg',
 248             'duration': 53.83,
 249             'timestamp': 1530032919,
 250             'upload_date': '20180626',
 251             'uploader_id': 'instagram',
 252             'uploader': 'Instagram',
 253             'like_count': int,
 254             'comment_count': int,
 255             'comments': list,
 256             'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
 257         }
 258     }, {
 259         'url': 'https://instagram.com/p/-Cmh1cukG2/',
 260         'only_matching': True,
 261     }, {
 262         'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
 263         'only_matching': True,
 264     }, {
 265         'url': 'https://www.instagram.com/tv/aye83DjauH/',
 266         'only_matching': True,
 267     }, {
 268         'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
 269         'only_matching': True,
 270     }, {
 271         'url': 'https://www.instagram.com/marvelskies.fc/reel/CWqAgUZgCku/',
 272         'only_matching': True,
 273     }]
 274
 275     @staticmethod
 276     def _extract_embed_url(webpage):
 277         mobj = re.search(
 278             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
 279             webpage)
 280         if mobj:
 281             return mobj.group('url')
 282
 283         blockquote_el = get_element_by_attribute(
 284             'class', 'instagram-media', webpage)
 285         if blockquote_el is None:
 286             return
 287
 288         mobj = re.search(
 289             r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
 290         if mobj:
 291             return mobj.group('link')
 292
 293     def _real_extract(self, url):
 294         video_id, url = self._match_valid_url(url).group('id', 'url')
 295         webpage, urlh = self._download_webpage_handle(url, video_id)
 296         if 'www.instagram.com/accounts/login' in urlh.geturl():
 297             self.report_warning('Main webpage is locked behind the login page. '
 298                                 'Retrying with embed webpage (Note that some metadata might be missing)')
 299             webpage = self._download_webpage(
 300                 'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage')
 301
 302         shared_data = self._parse_json(
 303             self._search_regex(
 304                 r'window\._sharedData\s*=\s*({.+?});',
 305                 webpage, 'shared data', default='{}'),
 306             video_id, fatal=False)
 307         media = traverse_obj(
 308             shared_data,
 309             ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
 310             ('entry_data', 'PostPage', 0, 'media'),
 311             expected_type=dict)
 312
 313         # _sharedData.entry_data.PostPage is empty when authenticated (see
 314         # https://github.com/ytdl-org/youtube-dl/pull/22880)
 315         if not media:
 316             additional_data = self._parse_json(
 317                 self._search_regex(
 318                     r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
 319                     webpage, 'additional data', default='{}'),
 320                 video_id, fatal=False)
 321             media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}
 322
 323         if not media and 'www.instagram.com/accounts/login' in urlh.geturl():
 324             self.raise_login_required('You need to log in to access this content')
 325
 326         uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex(
 327             r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False)
 328
 329         description = (
 330             traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
 331             or media.get('caption'))
 332         if not description:
 333             description = self._search_regex(
 334                 r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
 335             if description is not None:
 336                 description = lowercase_escape(description)
 337
 338         video_url = media.get('video_url')
 339         if not video_url:
 340             nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or []
 341             if nodes:
 342                 return self.playlist_result(
 343                     self._extract_nodes(nodes, True), video_id,
 344                     'Post by %s' % uploader_id if uploader_id else None, description)
 345
 346             video_url = self._og_search_video_url(webpage, secure=False)
 347
 348         formats = [{
 349             'url': video_url,
 350             'width': self._get_dimension('width', media, webpage),
 351             'height': self._get_dimension('height', media, webpage),
 352         }]
 353         dash = traverse_obj(media, ('dash_info', 'video_dash_manifest'))
 354         if dash:
 355             formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
 356         self._sort_formats(formats)
 357
 358         comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))
 359         comments = [{
 360             'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
 361             'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
 362             'id': traverse_obj(comment_dict, ('node', 'id')),
 363             'text': traverse_obj(comment_dict, ('node', 'text')),
 364             'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
 365         } for comment_dict in comment_data] if comment_data else None
 366
 367         display_resources = (
 368             media.get('display_resources')
 369             or [{'src': media.get(key)} for key in ('display_src', 'display_url')]
 370             or [{'src': self._og_search_thumbnail(webpage)}])
 371         thumbnails = [{
 372             'url': thumbnail['src'],
 373             'width': thumbnail.get('config_width'),
 374             'height': thumbnail.get('config_height'),
 375         } for thumbnail in display_resources if thumbnail.get('src')]
 376
 377         return {
 378             'id': video_id,
 379             'formats': formats,
 380             'title': media.get('title') or 'Video by %s' % uploader_id,
 381             'description': description,
 382             'duration': float_or_none(media.get('video_duration')),
 383             'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
 384             'uploader_id': uploader_id,
 385             'uploader': traverse_obj(media, ('owner', 'full_name')),
 386             'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
 387                 r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
 388             'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
 389             'comments': comments,
 390             'thumbnails': thumbnails,
 391             'http_headers': {
 392                 'Referer': 'https://www.instagram.com/',
 393             }
 394         }
 395
 396
 397 class InstagramPlaylistBaseIE(InstagramBaseIE):
 398     _gis_tmpl = None  # used to cache GIS request type
 399
 400     def _parse_graphql(self, webpage, item_id):
 401         # Reads a webpage and returns its GraphQL data.
 402         return self._parse_json(
 403             self._search_regex(
 404                 r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
 405             item_id)
 406
 407     def _extract_graphql(self, data, url):
 408         # Parses GraphQL queries containing videos and generates a playlist.
 409         uploader_id = self._match_id(url)
 410         csrf_token = data['config']['csrf_token']
 411         rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'
 412
 413         cursor = ''
 414         for page_num in itertools.count(1):
 415             variables = {
 416                 'first': 12,
 417                 'after': cursor,
 418             }
 419             variables.update(self._query_vars_for(data))
 420             variables = json.dumps(variables)
 421
 422             if self._gis_tmpl:
 423                 gis_tmpls = [self._gis_tmpl]
 424             else:
 425                 gis_tmpls = [
 426                     '%s' % rhx_gis,
 427                     '',
 428                     '%s:%s' % (rhx_gis, csrf_token),
 429                     '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
 430                 ]
 431
 432             # try all of the ways to generate a GIS query, and not only use the
 433             # first one that works, but cache it for future requests
 434             for gis_tmpl in gis_tmpls:
 435                 try:
 436                     json_data = self._download_json(
 437                         'https://www.instagram.com/graphql/query/', uploader_id,
 438                         'Downloading JSON page %d' % page_num, headers={
 439                             'X-Requested-With': 'XMLHttpRequest',
 440                             'X-Instagram-GIS': hashlib.md5(
 441                                 ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
 442                         }, query={
 443                             'query_hash': self._QUERY_HASH,
 444                             'variables': variables,
 445                         })
 446                     media = self._parse_timeline_from(json_data)
 447                     self._gis_tmpl = gis_tmpl
 448                     break
 449                 except ExtractorError as e:
 450                     # if it's an error caused by a bad query, and there are
 451                     # more GIS templates to try, ignore it and keep trying
 452                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
 453                         if gis_tmpl != gis_tmpls[-1]:
 454                             continue
 455                     raise
 456
 457             nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or []
 458             if not nodes:
 459                 break
 460             yield from self._extract_nodes(nodes)
 461
 462             has_next_page = traverse_obj(media, ('page_info', 'has_next_page'))
 463             cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str)
 464             if not has_next_page or not cursor:
 465                 break
 466
 467     def _real_extract(self, url):
 468         user_or_tag = self._match_id(url)
 469         webpage = self._download_webpage(url, user_or_tag)
 470         data = self._parse_graphql(webpage, user_or_tag)
 471
 472         self._set_cookie('instagram.com', 'ig_pr', '1')
 473
 474         return self.playlist_result(
 475             self._extract_graphql(data, url), user_or_tag, user_or_tag)
 476
 477
 478 class InstagramUserIE(InstagramPlaylistBaseIE):
 479     _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
 480     IE_DESC = 'Instagram user profile'
 481     IE_NAME = 'instagram:user'
 482     _TESTS = [{
 483         'url': 'https://instagram.com/porsche',
 484         'info_dict': {
 485             'id': 'porsche',
 486             'title': 'porsche',
 487         },
 488         'playlist_count': 5,
 489         'params': {
 490             'extract_flat': True,
 491             'skip_download': True,
 492             'playlistend': 5,
 493         }
 494     }]
 495
 496     _QUERY_HASH = '42323d64886122307be10013ad2dcc44',
 497
 498     @staticmethod
 499     def _parse_timeline_from(data):
 500         # extracts the media timeline data from a GraphQL result
 501         return data['data']['user']['edge_owner_to_timeline_media']
 502
 503     @staticmethod
 504     def _query_vars_for(data):
 505         # returns a dictionary of variables to add to the timeline query based
 506         # on the GraphQL of the original page
 507         return {
 508             'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
 509         }
 510
 511
 512 class InstagramTagIE(InstagramPlaylistBaseIE):
 513     _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
 514     IE_DESC = 'Instagram hashtag search URLs'
 515     IE_NAME = 'instagram:tag'
 516     _TESTS = [{
 517         'url': 'https://instagram.com/explore/tags/lolcats',
 518         'info_dict': {
 519             'id': 'lolcats',
 520             'title': 'lolcats',
 521         },
 522         'playlist_count': 50,
 523         'params': {
 524             'extract_flat': True,
 525             'skip_download': True,
 526             'playlistend': 50,
 527         }
 528     }]
 529
 530     _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314',
 531
 532     @staticmethod
 533     def _parse_timeline_from(data):
 534         # extracts the media timeline data from a GraphQL result
 535         return data['data']['hashtag']['edge_hashtag_to_media']
 536
 537     @staticmethod
 538     def _query_vars_for(data):
 539         # returns a dictionary of variables to add to the timeline query based
 540         # on the GraphQL of the original page
 541         return {
 542             'tag_name':
 543                 data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
 544         }