yt_dlp/extractor/instagram.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import itertools
   5 import hashlib
   6 import json
   7 import re
   8 import time
   9
  10 from .common import InfoExtractor
  11 from ..compat import (
  12     compat_str,
  13     compat_HTTPError,
  14 )
  15 from ..utils import (
  16     ExtractorError,
  17     float_or_none,
  18     get_element_by_attribute,
  19     int_or_none,
  20     lowercase_escape,
  21     std_headers,
  22     try_get,
  23     url_or_none,
  24     variadic,
  25     urlencode_postdata,
  26 )
  27
  28
  29 class InstagramBaseIE(InfoExtractor):
  30     _NETRC_MACHINE = 'instagram'
  31     _IS_LOGGED_IN = False
  32
  33     def _login(self):
  34         username, password = self._get_login_info()
  35         if username is None or self._IS_LOGGED_IN:
  36             return
  37
  38         login_webpage = self._download_webpage(
  39             'https://www.instagram.com/accounts/login/', None,
  40             note='Downloading login webpage', errnote='Failed to download login webpage')
  41
  42         shared_data = self._parse_json(
  43             self._search_regex(
  44                 r'window\._sharedData\s*=\s*({.+?});',
  45                 login_webpage, 'shared data', default='{}'),
  46             None)
  47
  48         login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={
  49             'Accept': '*/*',
  50             'X-IG-App-ID': '936619743392459',
  51             'X-ASBD-ID': '198387',
  52             'X-IG-WWW-Claim': '0',
  53             'X-Requested-With': 'XMLHttpRequest',
  54             'X-CSRFToken': shared_data['config']['csrf_token'],
  55             'X-Instagram-AJAX': shared_data['rollout_hash'],
  56             'Referer': 'https://www.instagram.com/',
  57         }, data=urlencode_postdata({
  58             'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
  59             'username': username,
  60             'queryParams': '{}',
  61             'optIntoOneTap': 'false',
  62             'stopDeletionNonce': '',
  63             'trustedDeviceRecords': '{}',
  64         }))
  65
  66         if not login.get('authenticated'):
  67             if login.get('message'):
  68                 raise ExtractorError(f'Unable to login: {login["message"]}')
  69             raise ExtractorError('Unable to login')
  70         InstagramBaseIE._IS_LOGGED_IN = True
  71
  72     def _real_initialize(self):
  73         self._login()
  74
  75
  76 class InstagramIOSIE(InfoExtractor):
  77     IE_DESC = 'IOS instagram:// URL'
  78     _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
  79     _TESTS = [{
  80         'url': 'instagram://media?id=482584233761418119',
  81         'md5': '0d2da106a9d2631273e192b372806516',
  82         'info_dict': {
  83             'id': 'aye83DjauH',
  84             'ext': 'mp4',
  85             'title': 'Video by naomipq',
  86             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
  87             'thumbnail': r're:^https?://.*\.jpg',
  88             'duration': 0,
  89             'timestamp': 1371748545,
  90             'upload_date': '20130620',
  91             'uploader_id': 'naomipq',
  92             'uploader': 'B E A U T Y  F O R  A S H E S',
  93             'like_count': int,
  94             'comment_count': int,
  95             'comments': list,
  96         },
  97         'add_ie': ['Instagram']
  98     }]
  99
 100     def _get_id(self, id):
 101         """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id"""
 102         chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
 103         media_id = int(id.split('_')[0])
 104         shortened_id = ''
 105         while media_id > 0:
 106             r = media_id % 64
 107             media_id = (media_id - r) // 64
 108             shortened_id = chrs[r] + shortened_id
 109         return shortened_id
 110
 111     def _real_extract(self, url):
 112         return {
 113             '_type': 'url_transparent',
 114             'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/',
 115             'ie_key': 'Instagram',
 116         }
 117
 118
 119 class InstagramIE(InstagramBaseIE):
 120     _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
 121     _TESTS = [{
 122         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
 123         'md5': '0d2da106a9d2631273e192b372806516',
 124         'info_dict': {
 125             'id': 'aye83DjauH',
 126             'ext': 'mp4',
 127             'title': 'Video by naomipq',
 128             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
 129             'thumbnail': r're:^https?://.*\.jpg',
 130             'duration': 0,
 131             'timestamp': 1371748545,
 132             'upload_date': '20130620',
 133             'uploader_id': 'naomipq',
 134             'uploader': 'B E A U T Y  F O R  A S H E S',
 135             'like_count': int,
 136             'comment_count': int,
 137             'comments': list,
 138         },
 139     }, {
 140         # missing description
 141         'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
 142         'info_dict': {
 143             'id': 'BA-pQFBG8HZ',
 144             'ext': 'mp4',
 145             'title': 'Video by britneyspears',
 146             'thumbnail': r're:^https?://.*\.jpg',
 147             'duration': 0,
 148             'timestamp': 1453760977,
 149             'upload_date': '20160125',
 150             'uploader_id': 'britneyspears',
 151             'uploader': 'Britney Spears',
 152             'like_count': int,
 153             'comment_count': int,
 154             'comments': list,
 155         },
 156         'params': {
 157             'skip_download': True,
 158         },
 159     }, {
 160         # multi video post
 161         'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
 162         'playlist': [{
 163             'info_dict': {
 164                 'id': 'BQ0dSaohpPW',
 165                 'ext': 'mp4',
 166                 'title': 'Video 1',
 167             },
 168         }, {
 169             'info_dict': {
 170                 'id': 'BQ0dTpOhuHT',
 171                 'ext': 'mp4',
 172                 'title': 'Video 2',
 173             },
 174         }, {
 175             'info_dict': {
 176                 'id': 'BQ0dT7RBFeF',
 177                 'ext': 'mp4',
 178                 'title': 'Video 3',
 179             },
 180         }],
 181         'info_dict': {
 182             'id': 'BQ0eAlwhDrw',
 183             'title': 'Post by instagram',
 184             'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
 185         },
 186     }, {
 187         # IGTV
 188         'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
 189         'info_dict': {
 190             'id': 'BkfuX9UB-eK',
 191             'ext': 'mp4',
 192             'title': 'Fingerboarding Tricks with @cass.fb',
 193             'thumbnail': r're:^https?://.*\.jpg',
 194             'duration': 53.83,
 195             'timestamp': 1530032919,
 196             'upload_date': '20180626',
 197             'uploader_id': 'instagram',
 198             'uploader': 'Instagram',
 199             'like_count': int,
 200             'comment_count': int,
 201             'comments': list,
 202             'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
 203         }
 204     }, {
 205         'url': 'https://instagram.com/p/-Cmh1cukG2/',
 206         'only_matching': True,
 207     }, {
 208         'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
 209         'only_matching': True,
 210     }, {
 211         'url': 'https://www.instagram.com/tv/aye83DjauH/',
 212         'only_matching': True,
 213     }, {
 214         'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
 215         'only_matching': True,
 216     }]
 217
 218     @staticmethod
 219     def _extract_embed_url(webpage):
 220         mobj = re.search(
 221             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
 222             webpage)
 223         if mobj:
 224             return mobj.group('url')
 225
 226         blockquote_el = get_element_by_attribute(
 227             'class', 'instagram-media', webpage)
 228         if blockquote_el is None:
 229             return
 230
 231         mobj = re.search(
 232             r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
 233         if mobj:
 234             return mobj.group('link')
 235
 236     def _real_extract(self, url):
 237         mobj = self._match_valid_url(url)
 238         video_id = mobj.group('id')
 239         url = mobj.group('url')
 240
 241         webpage, urlh = self._download_webpage_handle(url, video_id)
 242         if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'):
 243             self.raise_login_required('You need to log in to access this content')
 244
 245         (media, video_url, description, thumbnails, timestamp, uploader,
 246          uploader_id, like_count, comment_count, comments, height,
 247          width) = [None] * 12
 248
 249         shared_data = self._parse_json(
 250             self._search_regex(
 251                 r'window\._sharedData\s*=\s*({.+?});',
 252                 webpage, 'shared data', default='{}'),
 253             video_id, fatal=False)
 254         if shared_data:
 255             media = try_get(
 256                 shared_data,
 257                 (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
 258                  lambda x: x['entry_data']['PostPage'][0]['media']),
 259                 dict)
 260         # _sharedData.entry_data.PostPage is empty when authenticated (see
 261         # https://github.com/ytdl-org/youtube-dl/pull/22880)
 262         if not media:
 263             additional_data = self._parse_json(
 264                 self._search_regex(
 265                     r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
 266                     webpage, 'additional data', default='{}'),
 267                 video_id, fatal=False)
 268             if additional_data:
 269                 media = try_get(
 270                     additional_data, lambda x: x['graphql']['shortcode_media'],
 271                     dict)
 272         if media:
 273             video_url = media.get('video_url')
 274             height = int_or_none(self._html_search_meta(('og:video:height', 'video:height'), webpage)) or try_get(media, lambda x: x['dimensions']['height'])
 275             width = int_or_none(self._html_search_meta(('og:video:width', 'video:width'), webpage)) or try_get(media, lambda x: x['dimensions']['width'])
 276             description = try_get(
 277                 media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
 278                 compat_str) or media.get('caption')
 279             title = media.get('title')
 280             display_resources = media.get('display_resources')
 281             if not display_resources:
 282                 display_resources = [{'src': media.get('display_src')}, {'src': media.get('display_url')}]
 283             duration = float_or_none(media.get('video_duration'))
 284             timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
 285             uploader = try_get(media, lambda x: x['owner']['full_name'])
 286             uploader_id = try_get(media, lambda x: x['owner']['username'])
 287
 288             def get_count(keys, kind):
 289                 for key in variadic(keys):
 290                     count = int_or_none(try_get(
 291                         media, (lambda x: x['edge_media_%s' % key]['count'],
 292                                 lambda x: x['%ss' % kind]['count'])))
 293                     if count is not None:
 294                         return count
 295
 296             like_count = get_count('preview_like', 'like')
 297             comment_count = get_count(
 298                 ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')
 299
 300             thumbnails = [{
 301                 'url': thumbnail['src'],
 302                 'width': thumbnail.get('config_width'),
 303                 'height': thumbnail.get('config_height'),
 304             } for thumbnail in display_resources if thumbnail.get('src')]
 305
 306             comments = []
 307             for comment in try_get(media, lambda x: x['edge_media_to_parent_comment']['edges']):
 308                 comment_dict = comment.get('node', {})
 309                 comment_text = comment_dict.get('text')
 310                 if comment_text:
 311                     comments.append({
 312                         'author': try_get(comment_dict, lambda x: x['owner']['username']),
 313                         'author_id': try_get(comment_dict, lambda x: x['owner']['id']),
 314                         'id': comment_dict.get('id'),
 315                         'text': comment_text,
 316                         'timestamp': int_or_none(comment_dict.get('created_at')),
 317                     })
 318             if not video_url:
 319                 edges = try_get(
 320                     media, lambda x: x['edge_sidecar_to_children']['edges'],
 321                     list) or []
 322                 if edges:
 323                     entries = []
 324                     for edge_num, edge in enumerate(edges, start=1):
 325                         node = try_get(edge, lambda x: x['node'], dict)
 326                         if not node:
 327                             continue
 328                         node_video_url = url_or_none(node.get('video_url'))
 329                         if not node_video_url:
 330                             continue
 331                         entries.append({
 332                             'id': node.get('shortcode') or node['id'],
 333                             'title': node.get('title') or 'Video %d' % edge_num,
 334                             'url': node_video_url,
 335                             'thumbnail': node.get('display_url'),
 336                             'duration': float_or_none(node.get('video_duration')),
 337                             'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
 338                             'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
 339                             'view_count': int_or_none(node.get('video_view_count')),
 340                         })
 341                     return self.playlist_result(
 342                         entries, video_id,
 343                         'Post by %s' % uploader_id if uploader_id else None,
 344                         description)
 345
 346         if not video_url:
 347             video_url = self._og_search_video_url(webpage, secure=False)
 348
 349         formats = [{
 350             'url': video_url,
 351             'width': width,
 352             'height': height,
 353         }]
 354         dash = try_get(media, lambda x: x['dash_info']['video_dash_manifest'])
 355         if dash:
 356             formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
 357         self._sort_formats(formats)
 358
 359         if not uploader_id:
 360             uploader_id = self._search_regex(
 361                 r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
 362                 webpage, 'uploader id', fatal=False)
 363
 364         if not description:
 365             description = self._search_regex(
 366                 r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
 367             if description is not None:
 368                 description = lowercase_escape(description)
 369
 370         if not thumbnails:
 371             thumbnails = self._og_search_thumbnail(webpage)
 372
 373         return {
 374             'id': video_id,
 375             'formats': formats,
 376             'ext': 'mp4',
 377             'title': title or 'Video by %s' % uploader_id,
 378             'description': description,
 379             'duration': duration,
 380             'thumbnails': thumbnails,
 381             'timestamp': timestamp,
 382             'uploader_id': uploader_id,
 383             'uploader': uploader,
 384             'like_count': like_count,
 385             'comment_count': comment_count,
 386             'comments': comments,
 387             'http_headers': {
 388                 'Referer': 'https://www.instagram.com/',
 389             }
 390         }
 391
 392
 393 class InstagramPlaylistBaseIE(InstagramBaseIE):
 394     _gis_tmpl = None  # used to cache GIS request type
 395
 396     def _parse_graphql(self, webpage, item_id):
 397         # Reads a webpage and returns its GraphQL data.
 398         return self._parse_json(
 399             self._search_regex(
 400                 r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
 401             item_id)
 402
 403     def _extract_graphql(self, data, url):
 404         # Parses GraphQL queries containing videos and generates a playlist.
 405         def get_count(suffix):
 406             return int_or_none(try_get(
 407                 node, lambda x: x['edge_media_' + suffix]['count']))
 408
 409         uploader_id = self._match_id(url)
 410         csrf_token = data['config']['csrf_token']
 411         rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'
 412
 413         cursor = ''
 414         for page_num in itertools.count(1):
 415             variables = {
 416                 'first': 12,
 417                 'after': cursor,
 418             }
 419             variables.update(self._query_vars_for(data))
 420             variables = json.dumps(variables)
 421
 422             if self._gis_tmpl:
 423                 gis_tmpls = [self._gis_tmpl]
 424             else:
 425                 gis_tmpls = [
 426                     '%s' % rhx_gis,
 427                     '',
 428                     '%s:%s' % (rhx_gis, csrf_token),
 429                     '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
 430                 ]
 431
 432             # try all of the ways to generate a GIS query, and not only use the
 433             # first one that works, but cache it for future requests
 434             for gis_tmpl in gis_tmpls:
 435                 try:
 436                     json_data = self._download_json(
 437                         'https://www.instagram.com/graphql/query/', uploader_id,
 438                         'Downloading JSON page %d' % page_num, headers={
 439                             'X-Requested-With': 'XMLHttpRequest',
 440                             'X-Instagram-GIS': hashlib.md5(
 441                                 ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
 442                         }, query={
 443                             'query_hash': self._QUERY_HASH,
 444                             'variables': variables,
 445                         })
 446                     media = self._parse_timeline_from(json_data)
 447                     self._gis_tmpl = gis_tmpl
 448                     break
 449                 except ExtractorError as e:
 450                     # if it's an error caused by a bad query, and there are
 451                     # more GIS templates to try, ignore it and keep trying
 452                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
 453                         if gis_tmpl != gis_tmpls[-1]:
 454                             continue
 455                     raise
 456
 457             edges = media.get('edges')
 458             if not edges or not isinstance(edges, list):
 459                 break
 460
 461             for edge in edges:
 462                 node = edge.get('node')
 463                 if not node or not isinstance(node, dict):
 464                     continue
 465                 if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
 466                     continue
 467                 video_id = node.get('shortcode')
 468                 if not video_id:
 469                     continue
 470
 471                 info = self.url_result(
 472                     'https://instagram.com/p/%s/' % video_id,
 473                     ie=InstagramIE.ie_key(), video_id=video_id)
 474
 475                 description = try_get(
 476                     node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
 477                     compat_str)
 478                 thumbnail = node.get('thumbnail_src') or node.get('display_src')
 479                 timestamp = int_or_none(node.get('taken_at_timestamp'))
 480
 481                 comment_count = get_count('to_comment')
 482                 like_count = get_count('preview_like')
 483                 view_count = int_or_none(node.get('video_view_count'))
 484
 485                 info.update({
 486                     'description': description,
 487                     'thumbnail': thumbnail,
 488                     'timestamp': timestamp,
 489                     'comment_count': comment_count,
 490                     'like_count': like_count,
 491                     'view_count': view_count,
 492                 })
 493
 494                 yield info
 495
 496             page_info = media.get('page_info')
 497             if not page_info or not isinstance(page_info, dict):
 498                 break
 499
 500             has_next_page = page_info.get('has_next_page')
 501             if not has_next_page:
 502                 break
 503
 504             cursor = page_info.get('end_cursor')
 505             if not cursor or not isinstance(cursor, compat_str):
 506                 break
 507
 508     def _real_extract(self, url):
 509         user_or_tag = self._match_id(url)
 510         webpage = self._download_webpage(url, user_or_tag)
 511         data = self._parse_graphql(webpage, user_or_tag)
 512
 513         self._set_cookie('instagram.com', 'ig_pr', '1')
 514
 515         return self.playlist_result(
 516             self._extract_graphql(data, url), user_or_tag, user_or_tag)
 517
 518
 519 class InstagramUserIE(InstagramPlaylistBaseIE):
 520     _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
 521     IE_DESC = 'Instagram user profile'
 522     IE_NAME = 'instagram:user'
 523     _TESTS = [{
 524         'url': 'https://instagram.com/porsche',
 525         'info_dict': {
 526             'id': 'porsche',
 527             'title': 'porsche',
 528         },
 529         'playlist_count': 5,
 530         'params': {
 531             'extract_flat': True,
 532             'skip_download': True,
 533             'playlistend': 5,
 534         }
 535     }]
 536
 537     _QUERY_HASH = '42323d64886122307be10013ad2dcc44',
 538
 539     @staticmethod
 540     def _parse_timeline_from(data):
 541         # extracts the media timeline data from a GraphQL result
 542         return data['data']['user']['edge_owner_to_timeline_media']
 543
 544     @staticmethod
 545     def _query_vars_for(data):
 546         # returns a dictionary of variables to add to the timeline query based
 547         # on the GraphQL of the original page
 548         return {
 549             'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
 550         }
 551
 552
 553 class InstagramTagIE(InstagramPlaylistBaseIE):
 554     _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
 555     IE_DESC = 'Instagram hashtag search'
 556     IE_NAME = 'instagram:tag'
 557     _TESTS = [{
 558         'url': 'https://instagram.com/explore/tags/lolcats',
 559         'info_dict': {
 560             'id': 'lolcats',
 561             'title': 'lolcats',
 562         },
 563         'playlist_count': 50,
 564         'params': {
 565             'extract_flat': True,
 566             'skip_download': True,
 567             'playlistend': 50,
 568         }
 569     }]
 570
 571     _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314',
 572
 573     @staticmethod
 574     def _parse_timeline_from(data):
 575         # extracts the media timeline data from a GraphQL result
 576         return data['data']['hashtag']['edge_hashtag_to_media']
 577
 578     @staticmethod
 579     def _query_vars_for(data):
 580         # returns a dictionary of variables to add to the timeline query based
 581         # on the GraphQL of the original page
 582         return {
 583             'tag_name':
 584                 data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
 585         }