]> jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
Do not download the same tweet twice which could slowly waste API calls and bandwidth
[z_archive/twitter.git] / twitter / archiver.py
1 """USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4 DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11 OPTIONS
12 -o --oauth authenticate to Twitter using OAuth (default: no)
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
22
23 AUTHENTICATION
24 Authenticate to Twitter using OAuth to archive tweets of private profiles
25 and have higher API rate limits. OAuth authentication tokens are stored
26 in ~/.twitter-archiver_oauth.
27 """
28
29 from __future__ import print_function
30
31 import os, sys, time, calendar, functools
32 from getopt import gnu_getopt as getopt, GetoptError
33
34 try:
35 import urllib.request as urllib2
36 import http.client as httplib
37 except ImportError:
38 import urllib2
39 import httplib
40
41
42 # T-Archiver (Twitter-Archiver) application registered by @stalkr_
43 CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
44 CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
45
46 from .api import Twitter, TwitterError
47 from .oauth import OAuth, read_token_file
48 from .oauth_dance import oauth_dance
49 from .auth import NoAuth
50 from .util import Fail, err, expand_line, parse_host_list
51 from .follow import lookup
52
def parse_args(args, options):
    """Fill *options* in place from the command-line arguments *args*.

    Every recognized switch is stored under its long name (without the
    leading dashes): boolean switches are set to True, switches taking a
    value store that value. Remaining positional arguments are stored
    under 'extra_args'. -h/--help prints the module docstring and exits
    with status 0. Errors raised by getopt (GetoptError) are not handled
    here and propagate to the caller.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites="]
    short_opts = "hos:at:m:vfr:"
    parsed, positional = getopt(args, short_opts, long_opts)

    # switch spelling -> options key, for switches that are simple flags
    flags = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
    }
    # switch spelling -> options key, for switches that carry a value
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
    }

    for switch, value in parsed:
        if switch == '-h' or switch == '--help':
            print(__doc__)
            raise SystemExit(0)
        if switch in flags:
            options[flags[switch]] = True
        elif switch in valued:
            options[valued[switch]] = value

    options['extra_args'] = positional
81
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Args:
        filename: A string representing the path of a UTF-8 encoded
            archive holding one "<tweet id> <tweet text>" entry per line.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). The
        dict is empty when the archive cannot be opened, e.g. because it
        does not exist yet.

    Raises:
        ValueError: if a non-blank line does not look like
            "<int> <text>" or the file is not valid UTF-8.
    """
    # Local import: io.open decodes UTF-8 identically on Python 2 and 3,
    # whereas calling .decode() on a line read by the built-in open()
    # fails on Python 3, where lines are already text.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError: # no archive (yet)
        return {}

    tweets = {}
    # "with" closes the file even if a malformed line raises below
    with archive:
        for line in archive:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of discarding the archive
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
96
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    ascending tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when the dict is empty.
    A failure to open the file is reported through err() and the function
    returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import: io.open encodes to UTF-8 identically on Python 2 and
    # 3; formatting an already-encoded bytes object with "%s" on Python 3
    # would write its b'...' repr into the archive instead of the text.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    # "with" closes (and flushes) the file even if a write fails
    with archive:
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
122
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: A date string such as "Mon Jan 02 15:04:05 +0000 2012".
        to_localtime: When true and the local zone has a non-zero UTC
            offset (time.timezone != 0), convert to local time;
            otherwise the date is kept in UTC.

    Returns:
        A string "YYYY-MM-DD HH:MM:SS TZ", where TZ is "UTC" or the name
        of the local zone in effect at that date.
    """
    u = time.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name): pick the one that
        # applies to this very date instead of always using the DST name.
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
131
def expand_format_text(hosts, text):
    """Run text through expand_line() with hosts, then make it single-line."""
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
135
def direct_format_text(text):
    """Escape line feeds and carriage returns so text fits on one line."""
    # each raw control character becomes its two-character escape sequence
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
139
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names in statuses from their user ids.

    Both the author of each status and, for retweets, the author of the
    retweeted status are considered. All ids lacking a screen name are
    resolved with a single lookup() call, then the statuses are updated
    in place and returned as a new list in the same order.
    """
    def unnamed_users(status):
        # yield the user dicts of a status that have no screen name yet,
        # retweeted author first, then the author of the status itself
        retweeted = status.get('retweeted_status')
        if retweeted and not retweeted['user'].get('screen_name'):
            yield retweeted['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # collect every user id that needs a lookup
    user_ids = [user['id'] for status in tl for user in unnamed_users(status)]

    # resolve all of them at once (duplicates removed)
    names = lookup(twitter, list(set(user_ids)))

    # write the resolved names back into the statuses
    resolved = []
    for status in tl:
        for user in unnamed_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
167
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False):
    """Fetch one page (up to 200 requested) of statuses for a screen name.

    The endpoint is chosen in this order: mentions, favorites, the user
    timeline of screen_name, or the home timeline when screen_name is
    empty. When max_id is given it is passed along to the API call.

    Returns a dict mapping tweet id to "<date> <<screen_name>> <text>",
    where retweets are rendered as "RT @<original author>: <original
    text>" and the text goes through the module-level format_text.
    """
    kwargs = {'count': 200, 'include_rts': 1, 'screen_name': screen_name}
    if max_id:
        kwargs['max_id'] = max_id

    # pick the API endpoint first, call it once afterwards
    if mentions:
        fetch = twitter.statuses.mentions
    elif favorites:
        fetch = twitter.favorites # API v1, favorites.list() in v1.1
    elif screen_name:
        fetch = twitter.statuses.user_timeline
    else: # self
        fetch = twitter.statuses.home_timeline
    tl = fetch(**kwargs)

    tweets = {}
    # some tweets do not provide screen name but user id, resolve those
    for status in statuses_resolve_uids(twitter, tl):
        text = status['text']
        retweeted = status.get('retweeted_status')
        if retweeted:
            text = "RT @%s: %s" % (retweeted['user']['screen_name'],
                                   retweeted['text'])
        date = format_date(status['created_at'])
        author = status['user']['screen_name']
        tweets[status['id']] = "%s <%s> %s" % (date, author, format_text(text))
    return tweets
195
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False):
    """Collect all reachable statuses of a screen name into tweets.

    Pages are requested with statuses_portion(), walking backwards in
    time through max_id, and merged into the tweets dict in place. The
    walk stops as soon as a page brings fewer than 190 previously unknown
    tweets, or on a 401 or 404 error. On a 400 error the function sleeps
    until the reported rate limit reset and tries again; other errors
    are retried after a 3 second wait.
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, decreasing max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_epoch = rate['reset_time_in_seconds']
                reset = time.asctime(time.localtime(reset_epoch))
                delay = int(reset_epoch - time.time()) + 5 # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset, delay))
                fail.wait(delay)
                continue
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            # anything else is retried after a short wait
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known_before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", new))
            if new < 190:
                break
            max_id = min(portion.keys())-1 # browse backwards
            fail = Fail()
249
def rate_limit_status(twitter):
    """Query the API rate limit status and print it on standard output."""
    status = twitter.account.rate_limit_status()
    print("Remaining API requests: %i/%i (hourly limit)"
          % (status['remaining_hits'], status['hourly_limit']))

    reset_epoch = status['reset_time_in_seconds']
    seconds_left = int(reset_epoch - time.time())
    reset_at = time.asctime(time.localtime(reset_epoch))
    print("Next reset in %is (%s)" % (seconds_left, reset_at))
258
def _load_saved_tweets(filename):
    """Return the tweets already archived in filename, or {} if loading fails."""
    try:
        return load_tweets(filename)
    except Exception as e:
        # best effort: a damaged archive must not prevent archiving
        err("Error when loading saved tweets: %s - continuing without"
            % str(e))
        return {}

def _fetch_statuses(twitter, screen_name, tweets, options):
    """Run statuses(), turning Ctrl-C into an "Interrupted" exit (status 1)."""
    try:
        statuses(twitter, screen_name, tweets, options['mentions'], options['favorites'])
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)

def main(args=sys.argv[1:]):
    """Command-line entry point: archive tweets according to args.

    Parses args, authenticates (OAuth or no authentication), then
    depending on the options prints the API rate limit status, archives
    the authenticated user's own timeline or mentions, and archives the
    tweets (or favorites) of every user given on the command line or on
    standard input.

    Raises:
        SystemExit: with status 1 on invalid options, on an unauthenticated
            timeline/mentions request or when interrupted; with status 0
            after printing the help (-h).
    """
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline or mentions
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # statuses_portion() reads this module-level name to format tweet text
    global format_text
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = _load_saved_tweets(filename)
        _fetch_statuses(twitter, "", tweets, options)
        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: an empty user name would make statuses() fetch
        # the home timeline and save_tweets() target the save directory.
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    # NOTE(review): options['mentions'] is the (possibly non-empty) file
    # name given with -m and is forwarded as the "mentions" flag for every
    # user below as well - confirm that this is intended.
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_saved_tweets(filename)
        before = len(tweets)
        _fetch_statuses(twitter, user, tweets, options)
        save_tweets(filename, tweets)

        new = len(tweets) - before
        total += len(tweets)
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))