]>
Commit | Line | Data |
---|---|---|
0de668af JMF |
1 | from __future__ import unicode_literals |
2 | ||
59fc531f JMF |
3 | import re |
4 | ||
5 | from .common import InfoExtractor | |
e1ec9330 | 6 | from ..utils import ( |
c4096e8a | 7 | get_element_by_attribute, |
e1ec9330 YCH |
8 | int_or_none, |
9 | limit_length, | |
87696e78 | 10 | lowercase_escape, |
98960c91 | 11 | try_get, |
e1ec9330 | 12 | ) |
59fc531f | 13 | |
0de668af | 14 | |
class InstagramIE(InfoExtractor):
    """Extractor for a single Instagram post (``instagram.com/p/<id>``)."""

    # ``url`` captures the post URL up to and including the id, so any
    # trailing path (e.g. ``/embed/``), query string or fragment is dropped.
    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'Naomi Leonor Phan-Quang',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
    }, {
        # missing description
        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
        'info_dict': {
            'id': 'BA-pQFBG8HZ',
            'ext': 'mp4',
            'title': 'Video by britneyspears',
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1453760977,
            'upload_date': '20160125',
            'uploader_id': 'britneyspears',
            'uploader': 'Britney Spears',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_embed_url(webpage):
        """Return the URL of an Instagram post embedded in ``webpage``.

        Two embedding styles are recognised: an ``<iframe>`` whose ``src``
        points at ``instagram.com/p/<id>/embed``, and an element with class
        ``instagram-media`` containing an ``<a href=...>`` link.  Returns
        ``None`` when neither is found.
        """
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
            webpage)
        if mobj:
            return mobj.group('url')

        blockquote_el = get_element_by_attribute(
            'class', 'instagram-media', webpage)
        if blockquote_el is None:
            return

        mobj = re.search(
            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
        if mobj:
            return mobj.group('link')

    def _real_extract(self, url):
        """Download the post page and build the info dict for its video.

        Metadata is read from the ``window._sharedData`` JSON blob when it
        is present; the video URL, uploader id, description and thumbnail
        fall back to OpenGraph tags / regexes over the raw page otherwise.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Continue with the URL as captured by _VALID_URL (no query/fragment).
        url = mobj.group('url')

        webpage = self._download_webpage(url, video_id)

        # Every optional field starts as None so that the returned dict can
        # always be built.  ``comments`` must be part of this list: it is
        # only filled in when the shared-data media dict is available.
        (video_url, description, thumbnail, timestamp, uploader,
         uploader_id, like_count, comment_count, comments, height,
         width) = [None] * 11

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                webpage, 'shared data', default='{}'),
            video_id, fatal=False)
        if shared_data:
            media = try_get(
                shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
            if media:
                # ``x.get(key) or {}`` rather than ``x.get(key, {})``: a JSON
                # null is returned as None by get() and would break the
                # chained lookup.
                dimensions = media.get('dimensions') or {}
                owner = media.get('owner') or {}
                likes = media.get('likes') or {}
                comments_el = media.get('comments') or {}

                video_url = media.get('video_url')
                height = int_or_none(dimensions.get('height'))
                width = int_or_none(dimensions.get('width'))
                description = media.get('caption')
                thumbnail = media.get('display_src')
                timestamp = int_or_none(media.get('date'))
                uploader = owner.get('full_name')
                uploader_id = owner.get('username')
                like_count = int_or_none(likes.get('count'))
                comment_count = int_or_none(comments_el.get('count'))
                # Comment nodes without any text are skipped.
                comments = [{
                    'author': (comment.get('user') or {}).get('username'),
                    'author_id': (comment.get('user') or {}).get('id'),
                    'id': comment.get('id'),
                    'text': comment.get('text'),
                    'timestamp': int_or_none(comment.get('created_at')),
                } for comment in (comments_el.get('nodes') or [])
                    if comment.get('text')]

        if not video_url:
            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': width,
            'height': height,
        }]

        if not uploader_id:
            uploader_id = self._search_regex(
                r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
                webpage, 'uploader id', fatal=False)

        if not description:
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                description = lowercase_escape(description)

        if not thumbnail:
            thumbnail = self._og_search_thumbnail(webpage)

        return {
            'id': video_id,
            'formats': formats,
            'ext': 'mp4',
            'title': 'Video by %s' % uploader_id,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader': uploader,
            'like_count': like_count,
            'comment_count': comment_count,
            'comments': comments,
        }
ea38e55f PH |
154 | |
155 | ||
class InstagramUserIE(InfoExtractor):
    """Extractor that turns an Instagram user profile into a playlist of videos."""

    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TEST = {
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_mincount': 2,
        'playlist': [{
            'info_dict': {
                'id': '614605558512799803_462752227',
                'ext': 'mp4',
                'title': '#Porsche Intelligent Performance.',
                'thumbnail': r're:^https?://.*\.jpg',
                'uploader': 'Porsche',
                'uploader_id': 'porsche',
                'timestamp': 1387486713,
                'upload_date': '20131219',
            },
        }],
        'params': {
            'extract_flat': True,
            'skip_download': True,
        }
    }

    def _real_extract(self, url):
        """Page through ``/<username>/media`` and collect every video item.

        Each page is a JSON document with an ``items`` list; paging stops at
        the first page whose ``items`` is empty.  Returns a playlist dict
        whose id and title are the username.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader_id = mobj.group('username')

        entries = []
        page_count = 0
        media_url = 'http://instagram.com/%s/media' % uploader_id
        while True:
            page = self._download_json(
                media_url, uploader_id,
                note='Downloading page %d ' % (page_count + 1),
            )
            page_count += 1

            items = page['items']
            if not items:
                break

            for it in items:
                if it.get('type') != 'video':
                    continue

                # JSON nulls come back from get() as None, so
                # ``it.get(key, {})`` is not enough to make the chained
                # lookups safe; use ``or {}`` throughout.
                like_count = int_or_none((it.get('likes') or {}).get('count'))
                user = it.get('user') or {}

                formats = [{
                    'format_id': k,
                    'height': v.get('height'),
                    'width': v.get('width'),
                    'url': v['url'],
                } for k, v in it['videos'].items()]
                self._sort_formats(formats)

                thumbnails_el = it.get('images') or {}
                thumbnail = (thumbnails_el.get('thumbnail') or {}).get('url')

                # Fall back to the item id when there is no caption or the
                # caption has no (or empty) text, so the title is never empty.
                title = (it.get('caption') or {}).get('text') or it['id']

                entries.append({
                    'id': it['id'],
                    'title': limit_length(title, 80),
                    'formats': formats,
                    'thumbnail': thumbnail,
                    'webpage_url': it.get('link'),
                    'uploader': user.get('full_name'),
                    'uploader_id': user.get('username'),
                    'like_count': like_count,
                    'timestamp': int_or_none(it.get('created_time')),
                })

            # The next page is requested with ``max_id`` set to the part of
            # the last item's id that precedes the underscore.
            max_id = items[-1]['id'].split('_')[0]
            media_url = (
                'http://instagram.com/%s/media?max_id=%s' % (
                    uploader_id, max_id))

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': uploader_id,
            'title': uploader_id,
        }