yt_dlp/extractor/instagram.py

   1 from __future__ import unicode_literals
   2
   3 import itertools
   4 import hashlib
   5 import json
   6 import re
   7
   8 from .common import InfoExtractor
   9 from ..compat import (
  10     compat_str,
  11     compat_HTTPError,
  12 )
  13 from ..utils import (
  14     ExtractorError,
  15     float_or_none,
  16     get_element_by_attribute,
  17     int_or_none,
  18     lowercase_escape,
  19     std_headers,
  20     try_get,
  21     url_or_none,
  22 )
  23
  24
  25 class InstagramIE(InfoExtractor):
  26     _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
  27     _TESTS = [{
  28         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
  29         'md5': '0d2da106a9d2631273e192b372806516',
  30         'info_dict': {
  31             'id': 'aye83DjauH',
  32             'ext': 'mp4',
  33             'title': 'Video by naomipq',
  34             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
  35             'thumbnail': r're:^https?://.*\.jpg',
  36             'duration': 0,
  37             'timestamp': 1371748545,
  38             'upload_date': '20130620',
  39             'uploader_id': 'naomipq',
  40             'uploader': 'B E A U T Y  F O R  A S H E S',
  41             'like_count': int,
  42             'comment_count': int,
  43             'comments': list,
  44         },
  45     }, {
  46         # missing description
  47         'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
  48         'info_dict': {
  49             'id': 'BA-pQFBG8HZ',
  50             'ext': 'mp4',
  51             'title': 'Video by britneyspears',
  52             'thumbnail': r're:^https?://.*\.jpg',
  53             'duration': 0,
  54             'timestamp': 1453760977,
  55             'upload_date': '20160125',
  56             'uploader_id': 'britneyspears',
  57             'uploader': 'Britney Spears',
  58             'like_count': int,
  59             'comment_count': int,
  60             'comments': list,
  61         },
  62         'params': {
  63             'skip_download': True,
  64         },
  65     }, {
  66         # multi video post
  67         'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
  68         'playlist': [{
  69             'info_dict': {
  70                 'id': 'BQ0dSaohpPW',
  71                 'ext': 'mp4',
  72                 'title': 'Video 1',
  73             },
  74         }, {
  75             'info_dict': {
  76                 'id': 'BQ0dTpOhuHT',
  77                 'ext': 'mp4',
  78                 'title': 'Video 2',
  79             },
  80         }, {
  81             'info_dict': {
  82                 'id': 'BQ0dT7RBFeF',
  83                 'ext': 'mp4',
  84                 'title': 'Video 3',
  85             },
  86         }],
  87         'info_dict': {
  88             'id': 'BQ0eAlwhDrw',
  89             'title': 'Post by instagram',
  90             'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
  91         },
  92     }, {
  93         # IGTV
  94         'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
  95         'info_dict': {
  96             'id': 'BkfuX9UB-eK',
  97             'ext': 'mp4',
  98             'title': 'Fingerboarding Tricks with @cass.fb',
  99             'thumbnail': r're:^https?://.*\.jpg',
 100             'duration': 53.83,
 101             'timestamp': 1530032919,
 102             'upload_date': '20180626',
 103             'uploader_id': 'instagram',
 104             'uploader': 'Instagram',
 105             'like_count': int,
 106             'comment_count': int,
 107             'comments': list,
 108             'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
 109         }
 110     }, {
 111         'url': 'https://instagram.com/p/-Cmh1cukG2/',
 112         'only_matching': True,
 113     }, {
 114         'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
 115         'only_matching': True,
 116     }, {
 117         'url': 'https://www.instagram.com/tv/aye83DjauH/',
 118         'only_matching': True,
 119     }, {
 120         'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
 121         'only_matching': True,
 122     }]
 123
 124     @staticmethod
 125     def _extract_embed_url(webpage):
 126         mobj = re.search(
 127             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
 128             webpage)
 129         if mobj:
 130             return mobj.group('url')
 131
 132         blockquote_el = get_element_by_attribute(
 133             'class', 'instagram-media', webpage)
 134         if blockquote_el is None:
 135             return
 136
 137         mobj = re.search(
 138             r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
 139         if mobj:
 140             return mobj.group('link')
 141
 142     def _real_extract(self, url):
 143         mobj = re.match(self._VALID_URL, url)
 144         video_id = mobj.group('id')
 145         url = mobj.group('url')
 146
 147         webpage = self._download_webpage(url, video_id)
 148
 149         (media, video_url, description, thumbnail, timestamp, uploader,
 150          uploader_id, like_count, comment_count, comments, height,
 151          width) = [None] * 12
 152
 153         shared_data = self._parse_json(
 154             self._search_regex(
 155                 r'window\._sharedData\s*=\s*({.+?});',
 156                 webpage, 'shared data', default='{}'),
 157             video_id, fatal=False)
 158         if shared_data:
 159             media = try_get(
 160                 shared_data,
 161                 (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
 162                  lambda x: x['entry_data']['PostPage'][0]['media']),
 163                 dict)
 164         # _sharedData.entry_data.PostPage is empty when authenticated (see
 165         # https://github.com/ytdl-org/youtube-dl/pull/22880)
 166         if not media:
 167             additional_data = self._parse_json(
 168                 self._search_regex(
 169                     r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
 170                     webpage, 'additional data', default='{}'),
 171                 video_id, fatal=False)
 172             if additional_data:
 173                 media = try_get(
 174                     additional_data, lambda x: x['graphql']['shortcode_media'],
 175                     dict)
 176         if media:
 177             video_url = media.get('video_url')
 178             height = int_or_none(media.get('dimensions', {}).get('height'))
 179             width = int_or_none(media.get('dimensions', {}).get('width'))
 180             description = try_get(
 181                 media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
 182                 compat_str) or media.get('caption')
 183             title = media.get('title')
 184             thumbnail = media.get('display_src') or media.get('display_url')
 185             duration = float_or_none(media.get('video_duration'))
 186             timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
 187             uploader = media.get('owner', {}).get('full_name')
 188             uploader_id = media.get('owner', {}).get('username')
 189
 190             def get_count(keys, kind):
 191                 if not isinstance(keys, (list, tuple)):
 192                     keys = [keys]
 193                 for key in keys:
 194                     count = int_or_none(try_get(
 195                         media, (lambda x: x['edge_media_%s' % key]['count'],
 196                                 lambda x: x['%ss' % kind]['count'])))
 197                     if count is not None:
 198                         return count
 199             like_count = get_count('preview_like', 'like')
 200             comment_count = get_count(
 201                 ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')
 202
 203             comments = [{
 204                 'author': comment.get('user', {}).get('username'),
 205                 'author_id': comment.get('user', {}).get('id'),
 206                 'id': comment.get('id'),
 207                 'text': comment.get('text'),
 208                 'timestamp': int_or_none(comment.get('created_at')),
 209             } for comment in media.get(
 210                 'comments', {}).get('nodes', []) if comment.get('text')]
 211             if not video_url:
 212                 edges = try_get(
 213                     media, lambda x: x['edge_sidecar_to_children']['edges'],
 214                     list) or []
 215                 if edges:
 216                     entries = []
 217                     for edge_num, edge in enumerate(edges, start=1):
 218                         node = try_get(edge, lambda x: x['node'], dict)
 219                         if not node:
 220                             continue
 221                         node_video_url = url_or_none(node.get('video_url'))
 222                         if not node_video_url:
 223                             continue
 224                         entries.append({
 225                             'id': node.get('shortcode') or node['id'],
 226                             'title': node.get('title') or 'Video %d' % edge_num,
 227                             'url': node_video_url,
 228                             'thumbnail': node.get('display_url'),
 229                             'duration': float_or_none(node.get('video_duration')),
 230                             'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
 231                             'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
 232                             'view_count': int_or_none(node.get('video_view_count')),
 233                         })
 234                     return self.playlist_result(
 235                         entries, video_id,
 236                         'Post by %s' % uploader_id if uploader_id else None,
 237                         description)
 238
 239         if not video_url:
 240             video_url = self._og_search_video_url(webpage, secure=False)
 241
 242         formats = [{
 243             'url': video_url,
 244             'width': width,
 245             'height': height,
 246         }]
 247
 248         if not uploader_id:
 249             uploader_id = self._search_regex(
 250                 r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
 251                 webpage, 'uploader id', fatal=False)
 252
 253         if not description:
 254             description = self._search_regex(
 255                 r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
 256             if description is not None:
 257                 description = lowercase_escape(description)
 258
 259         if not thumbnail:
 260             thumbnail = self._og_search_thumbnail(webpage)
 261
 262         return {
 263             'id': video_id,
 264             'formats': formats,
 265             'ext': 'mp4',
 266             'title': title or 'Video by %s' % uploader_id,
 267             'description': description,
 268             'duration': duration,
 269             'thumbnail': thumbnail,
 270             'timestamp': timestamp,
 271             'uploader_id': uploader_id,
 272             'uploader': uploader,
 273             'like_count': like_count,
 274             'comment_count': comment_count,
 275             'comments': comments,
 276         }
 277
 278
 279 class InstagramPlaylistIE(InfoExtractor):
 280     # A superclass for handling any kind of query based on GraphQL which
 281     # results in a playlist.
 282
 283     _gis_tmpl = None  # used to cache GIS request type
 284
 285     def _parse_graphql(self, webpage, item_id):
 286         # Reads a webpage and returns its GraphQL data.
 287         return self._parse_json(
 288             self._search_regex(
 289                 r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
 290             item_id)
 291
 292     def _extract_graphql(self, data, url):
 293         # Parses GraphQL queries containing videos and generates a playlist.
 294         def get_count(suffix):
 295             return int_or_none(try_get(
 296                 node, lambda x: x['edge_media_' + suffix]['count']))
 297
 298         uploader_id = self._match_id(url)
 299         csrf_token = data['config']['csrf_token']
 300         rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'
 301
 302         cursor = ''
 303         for page_num in itertools.count(1):
 304             variables = {
 305                 'first': 12,
 306                 'after': cursor,
 307             }
 308             variables.update(self._query_vars_for(data))
 309             variables = json.dumps(variables)
 310
 311             if self._gis_tmpl:
 312                 gis_tmpls = [self._gis_tmpl]
 313             else:
 314                 gis_tmpls = [
 315                     '%s' % rhx_gis,
 316                     '',
 317                     '%s:%s' % (rhx_gis, csrf_token),
 318                     '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
 319                 ]
 320
 321             # try all of the ways to generate a GIS query, and not only use the
 322             # first one that works, but cache it for future requests
 323             for gis_tmpl in gis_tmpls:
 324                 try:
 325                     json_data = self._download_json(
 326                         'https://www.instagram.com/graphql/query/', uploader_id,
 327                         'Downloading JSON page %d' % page_num, headers={
 328                             'X-Requested-With': 'XMLHttpRequest',
 329                             'X-Instagram-GIS': hashlib.md5(
 330                                 ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
 331                         }, query={
 332                             'query_hash': self._QUERY_HASH,
 333                             'variables': variables,
 334                         })
 335                     media = self._parse_timeline_from(json_data)
 336                     self._gis_tmpl = gis_tmpl
 337                     break
 338                 except ExtractorError as e:
 339                     # if it's an error caused by a bad query, and there are
 340                     # more GIS templates to try, ignore it and keep trying
 341                     if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
 342                         if gis_tmpl != gis_tmpls[-1]:
 343                             continue
 344                     raise
 345
 346             edges = media.get('edges')
 347             if not edges or not isinstance(edges, list):
 348                 break
 349
 350             for edge in edges:
 351                 node = edge.get('node')
 352                 if not node or not isinstance(node, dict):
 353                     continue
 354                 if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
 355                     continue
 356                 video_id = node.get('shortcode')
 357                 if not video_id:
 358                     continue
 359
 360                 info = self.url_result(
 361                     'https://instagram.com/p/%s/' % video_id,
 362                     ie=InstagramIE.ie_key(), video_id=video_id)
 363
 364                 description = try_get(
 365                     node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
 366                     compat_str)
 367                 thumbnail = node.get('thumbnail_src') or node.get('display_src')
 368                 timestamp = int_or_none(node.get('taken_at_timestamp'))
 369
 370                 comment_count = get_count('to_comment')
 371                 like_count = get_count('preview_like')
 372                 view_count = int_or_none(node.get('video_view_count'))
 373
 374                 info.update({
 375                     'description': description,
 376                     'thumbnail': thumbnail,
 377                     'timestamp': timestamp,
 378                     'comment_count': comment_count,
 379                     'like_count': like_count,
 380                     'view_count': view_count,
 381                 })
 382
 383                 yield info
 384
 385             page_info = media.get('page_info')
 386             if not page_info or not isinstance(page_info, dict):
 387                 break
 388
 389             has_next_page = page_info.get('has_next_page')
 390             if not has_next_page:
 391                 break
 392
 393             cursor = page_info.get('end_cursor')
 394             if not cursor or not isinstance(cursor, compat_str):
 395                 break
 396
 397     def _real_extract(self, url):
 398         user_or_tag = self._match_id(url)
 399         webpage = self._download_webpage(url, user_or_tag)
 400         data = self._parse_graphql(webpage, user_or_tag)
 401
 402         self._set_cookie('instagram.com', 'ig_pr', '1')
 403
 404         return self.playlist_result(
 405             self._extract_graphql(data, url), user_or_tag, user_or_tag)
 406
 407
 408 class InstagramUserIE(InstagramPlaylistIE):
 409     _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
 410     IE_DESC = 'Instagram user profile'
 411     IE_NAME = 'instagram:user'
 412     _TEST = {
 413         'url': 'https://instagram.com/porsche',
 414         'info_dict': {
 415             'id': 'porsche',
 416             'title': 'porsche',
 417         },
 418         'playlist_count': 5,
 419         'params': {
 420             'extract_flat': True,
 421             'skip_download': True,
 422             'playlistend': 5,
 423         }
 424     }
 425
 426     _QUERY_HASH = '42323d64886122307be10013ad2dcc44',
 427
 428     @staticmethod
 429     def _parse_timeline_from(data):
 430         # extracts the media timeline data from a GraphQL result
 431         return data['data']['user']['edge_owner_to_timeline_media']
 432
 433     @staticmethod
 434     def _query_vars_for(data):
 435         # returns a dictionary of variables to add to the timeline query based
 436         # on the GraphQL of the original page
 437         return {
 438             'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
 439         }
 440
 441
 442 class InstagramTagIE(InstagramPlaylistIE):
 443     _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
 444     IE_DESC = 'Instagram hashtag search'
 445     IE_NAME = 'instagram:tag'
 446     _TEST = {
 447         'url': 'https://instagram.com/explore/tags/lolcats',
 448         'info_dict': {
 449             'id': 'lolcats',
 450             'title': 'lolcats',
 451         },
 452         'playlist_count': 50,
 453         'params': {
 454             'extract_flat': True,
 455             'skip_download': True,
 456             'playlistend': 50,
 457         }
 458     }
 459
 460     _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314',
 461
 462     @staticmethod
 463     def _parse_timeline_from(data):
 464         # extracts the media timeline data from a GraphQL result
 465         return data['data']['hashtag']['edge_hashtag_to_media']
 466
 467     @staticmethod
 468     def _query_vars_for(data):
 469         # returns a dictionary of variables to add to the timeline query based
 470         # on the GraphQL of the original page
 471         return {
 472             'tag_name':
 473                 data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
 474         }