jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
Fixed broken rate limiting error handling in archiver/follow.
[z_archive/twitter.git] / twitter / archiver.py
1 """USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4 DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11 OPTIONS
12 -o --oauth authenticate to Twitter using OAuth (default: no)
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
22 -d --dms <file> archive own direct messages (both received and
23 sent) into given file name.
24 -i --isoformat store dates in ISO format (specifically RFC 3339)
25
26 AUTHENTICATION
27 Authenticate to Twitter using OAuth to archive tweets of private profiles
28 and have higher API rate limits. OAuth authentication tokens are stored
29 in ~/.twitter-archiver_oauth.
30 """
31
32 from __future__ import print_function
33
34 import os, sys, time as _time, calendar, functools
35 from datetime import time, date, datetime
36 from getopt import gnu_getopt as getopt, GetoptError
37
38 try:
39 import urllib.request as urllib2
40 import http.client as httplib
41 except ImportError:
42 import urllib2
43 import httplib
44
45
# T-Archiver (Twitter-Archiver) application registered by @stalkr_
# OAuth consumer credentials of that application; main() hands them to
# oauth_dance() and OAuth() when the --oauth option is given.
CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
49
50 from .api import Twitter, TwitterError
51 from .oauth import OAuth, read_token_file
52 from .oauth_dance import oauth_dance
53 from .auth import NoAuth
54 from .util import Fail, err, expand_line, parse_host_list
55 from .follow import lookup
56 from .timezones import utc as UTC, Local
57
def parse_args(args, options):
    """Fill ``options`` in place from the command-line ``args``.

    Switches store True under their key, options taking a value store
    that value, and the remaining positional arguments are stored under
    ``options['extra_args']``.  ``-h``/``--help`` prints the module
    docstring and exits with status 0.  Errors raised by getopt (for
    instance on an unknown option) propagate to the caller.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites=", 'dms=', 'isoformat']
    short_opts = "hos:at:m:vfr:d:i"

    # option spelling -> key in ``options`` for plain on/off switches
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
        '-i': 'isoformat', '--isoformat': 'isoformat',
    }
    # option spelling -> key in ``options`` for options carrying a value
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
        '-d': 'dms', '--dms': 'dms',
    }

    parsed, extra_args = getopt(args, short_opts, long_opts)
    for name, value in parsed:
        if name in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if name in switches:
            options[switches[name]] = True
        elif name in valued:
            options[valued[name]] = value

    options['extra_args'] = extra_args
90
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    The archive is read as UTF-8 text, one tweet per line in the form
    ``<tweet id> <tweet text>``.  Blank lines are ignored.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet-ids (int) to tweet text (unicode text).
        An empty dict is returned when the file cannot be opened
        (typically because no archive exists yet).

    Raises:
        ValueError: if a non-blank line is not ``<int> <text>``.
    """
    # Local import keeps this block self-contained.  io.open decodes
    # UTF-8 identically on Python 2 and 3, whereas calling .decode() on
    # the lines fails on Python 3, where they are already text.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError: # no archive (yet)
        return {}

    tweets = {}
    # "with" guarantees the handle is closed even if a line is malformed
    with archive:
        for line in archive:
            line = line.strip()
            if not line:
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
105
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted
    by tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when ``tweets`` is
    empty.  If the file cannot be opened the error is reported through
    err() and the function returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import keeps this block self-contained.  Encoding is left to
    # io.open: formatting already-encoded bytes with "%s" writes their
    # b'...' repr on Python 3.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    # "with" closes the file even if a write fails part-way through
    with archive:
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
131
def format_date(utc, isoformat=False):
    """Turn Twitter's UTC timestamp string into a local-time string.

    ``utc`` is expected in the form parsed below, e.g.
    ``Mon Jan 02 15:04:05 +0000 2012``.  The result is rendered with
    datetime.isoformat() when ``isoformat`` is true, otherwise as
    ``YYYY-MM-DD HH:MM:SS TZ``.
    """
    parsed = datetime.strptime(utc.replace('+0000','UTC'),
                               '%a %b %d %H:%M:%S %Z %Y')

    # Attach the UTC timezone to the parsed value, then convert it to
    # the Local timezone.
    local = parsed.replace(tzinfo=UTC).astimezone(Local)

    if isoformat:
        return local.isoformat()
    return local.strftime('%Y-%m-%d %H:%M:%S %Z')
148
def expand_format_text(hosts, text):
    """Run ``text`` through expand_line() for ``hosts``, then flatten it.

    The expanded text is passed to direct_format_text() so the result
    fits on a single archive line.
    """
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
152
def direct_format_text(text):
    """Escape line breaks so that ``text`` fits on a single line.

    Newlines become the two characters ``\\n`` and carriage returns
    become ``\\r``; everything else is left untouched.
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
156
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names of the statuses in ``tl``.

    Every status (and embedded ``retweeted_status``) whose ``user`` has
    no ``screen_name`` gets one, obtained from a single lookup() call
    with all the user ids concerned.  Statuses are updated in place and
    also returned as a new list, in their original order.
    """
    def nameless(user):
        return not user.get('screen_name')

    # first pass: collect each user id that still needs a name
    pending = set()
    for status in tl:
        retweet = status.get('retweeted_status')
        if retweet and nameless(retweet['user']):
            pending.add(retweet['user']['id'])
        if nameless(status['user']):
            pending.add(status['user']['id'])

    # one lookup for all of them
    names = lookup(twitter, list(pending))

    # second pass: write the resolved names back
    resolved = []
    for status in tl:
        retweet = status.get('retweeted_status')
        if retweet and nameless(retweet['user']):
            rt_user = status['retweeted_status']['user']
            rt_user['screen_name'] = names[retweet['user']['id']]
        if nameless(status['user']):
            status['user']['screen_name'] = names[status['user']['id']]
        resolved.append(status)

    return resolved
184
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API object the requests are issued on.
        screen_name: user to fetch; a false value selects the home
            timeline when a plain timeline is requested.
        max_id: if set, passed to the API as ``max_id``.
        mentions: fetch the mentions timeline.
        favorites: fetch favorites (only consulted when ``mentions`` is
            false).
        received_dms: None for statuses; otherwise True fetches received
            direct messages and False fetches sent ones (only consulted
            when neither ``mentions`` nor ``favorites`` is set).
        isoformat: forwarded to format_date().

    Returns:
        A dict mapping each item's id to its formatted archive line.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    # identity test: None means "not DMs", True/False pick the DM kind
    is_dm = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else: # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name: # timeline
        tl = twitter.statuses.user_timeline(**kwargs)
    else: # self
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if not is_dm:
        tl = statuses_resolve_uids(twitter, tl)

    # main() installs the module-level format_text; fall back to the
    # plain formatter so this function also works when called without
    # main() having run (otherwise the name lookup raises NameError).
    formatter = globals().get('format_text', direct_format_text)

    tweets = {}
    for t in tl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        when = format_date(t['created_at'], isoformat=isoformat)
        if is_dm:
            # DMs don't include mentions by default, so in order to show
            # who the recipient was, we synthesise a mention.
            tweets[t['id']] = "%s <%s> @%s %s" % (when,
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  formatter(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (when,
                                              t['user']['screen_name'],
                                              formatter(text))
    return tweets
231
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Fetch statuses portion by portion and merge them into ``tweets``.

    ``tweets`` is updated in place.  Fetching walks backwards by moving
    ``max_id`` below the smallest id of the last portion, and stops once
    a portion adds fewer than 190 new entries.  HTTP 401 and 404 abort
    the loop; HTTP 429 sleeps until the reported rate-limit reset; any
    other handled error is retried after a 3 second wait through a Fail
    counter that is renewed after each successful portion.
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, lowering max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites, received_dms, isoformat)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 429:
                err("Fail: %i API rate limit exceeded" % code)
                rls = twitter.application.rate_limit_status()
                reset_epoch = rls.rate_limit_reset
                reset_text = _time.asctime(_time.localtime(reset_epoch))
                delay = int(reset_epoch - _time.time()) + 5 # avoid race
                err("Interval limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rls.rate_limit_limit,
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            # every remaining code is retried after a short pause
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known_before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name if screen_name else "home", new))
            if new < 190:
                break
            max_id = min(portion.keys())-1 # browse backwards
            fail = Fail()
285
def rate_limit_status(twitter):
    """Print the remaining API requests and the time of the next reset.

    Values are read from the object returned by
    ``twitter.application.rate_limit_status()``.
    """
    rls = twitter.application.rate_limit_status()

    quota = (rls.rate_limit_remaining, rls.rate_limit_limit)
    print("Remaining API requests: %i/%i (interval limit)" % quota)

    seconds_left = int(rls.rate_limit_reset - _time.time())
    reset_text = _time.asctime(_time.localtime(rls.rate_limit_reset))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
294
def _load_archive(filename, label):
    """Return the entries saved in ``filename``, or {} if loading fails."""
    try:
        return load_tweets(filename)
    except Exception as e:
        err("Error when loading saved %s: %s - continuing without"
            % (label, str(e)))
        return {}


def _fetch_or_exit(twitter, screen_name, tweets, **kwargs):
    """Run statuses(); on Ctrl-C report the interruption and exit with 1."""
    try:
        statuses(twitter, screen_name, tweets, **kwargs)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)


def main(args=sys.argv[1:]):
    """Command-line entry point of twitter-archiver.

    Parses ``args``, optionally authenticates with OAuth, then, depending
    on the options: prints the API rate limit status and returns;
    archives the authenticated user's timeline or mentions; archives the
    authenticated user's direct messages; and archives the tweets (or
    favorites) of every user named on the command line or read from
    standard input.

    Raises:
        SystemExit: status 1 on invalid options, on a request needing
            authentication without OAuth, or when interrupted.
    """
    # the formatter used by statuses_portion() is chosen below
    global format_text

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        else:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = _load_archive(filename, "tweets")
        _fetch_or_exit(twitter, "", tweets,
                       mentions=options['mentions'],
                       favorites=options['favorites'],
                       isoformat=options['isoformat'])

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        else:
            print("Total mentions: %i" % len(tweets))

    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = _load_archive(filename, "DMs")
        # received first, then sent, both merged into the same archive
        for received in (True, False):
            _fetch_or_exit(twitter, "", dms, received_dms=received,
                           isoformat=options['isoformat'])

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # skip blank lines: an empty name would select the home timeline
        # and point the archive file name at the save directory itself
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_archive(filename, "tweets")
        before = len(tweets)
        _fetch_or_exit(twitter, user, tweets,
                       mentions=options['mentions'],
                       favorites=options['favorites'],
                       isoformat=options['isoformat'])

        save_tweets(filename, tweets)
        new = len(tweets) - before
        total += len(tweets)
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))