]> jfr.im git - z_archive/twitter.git/blame - twitter/archiver.py
Fixed broken rate limiting error handling in archiver/follow.
[z_archive/twitter.git] / twitter / archiver.py
CommitLineData
a7282452
S
1"""USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11OPTIONS
4f0b5ca6 12 -o --oauth authenticate to Twitter using OAuth (default: no)
a7282452
S
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
4f0b5ca6
H
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
907402f6 20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
01618308
MC
22 -d --dms <file> archive own direct messages (both received and
23 sent) into given file name.
694aaadf 24 -i --isoformat store dates in ISO format (specifically RFC 3339)
a7282452
S
25
26AUTHENTICATION
27 Authenticate to Twitter using OAuth to archive tweets of private profiles
28 and have higher API rate limits. OAuth authentication tokens are stored
29 in ~/.twitter-archiver_oauth.
30"""
31
32from __future__ import print_function
33
694aaadf
MC
34import os, sys, time as _time, calendar, functools
35from datetime import time, date, datetime
a7282452
S
36from getopt import gnu_getopt as getopt, GetoptError
37
62ec1b07 38try:
39 import urllib.request as urllib2
40 import http.client as httplib
41except ImportError:
42 import urllib2
43 import httplib
44
45
a7282452
S
46# T-Archiver (Twitter-Archiver) application registered by @stalkr_
47CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
48CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
49
50from .api import Twitter, TwitterError
51from .oauth import OAuth, read_token_file
52from .oauth_dance import oauth_dance
53from .auth import NoAuth
907402f6 54from .util import Fail, err, expand_line, parse_host_list
a7282452 55from .follow import lookup
694aaadf 56from .timezones import utc as UTC, Local
a7282452
S
57
def parse_args(args, options):
    """Parse command-line arguments and record them in ``options``.

    Args:
        args: List of command-line argument strings (program name excluded).
        options: Dict updated in place. Switches are stored as ``True``,
            options taking a value store that value as a string, and the
            remaining positional arguments are stored under 'extra_args'.

    Raises:
        GetoptError: Propagated from getopt when it rejects the arguments.
        SystemExit: With status 0, after printing the module docstring,
            when -h/--help is given.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 "redirect-sites=", 'dms=', 'isoformat']
    short_opts = "hos:at:m:vfr:d:i"
    parsed, positional = getopt(args, short_opts, long_opts)

    # Options that simply switch something on.
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
        '-i': 'isoformat', '--isoformat': 'isoformat',
    }
    # Options that carry an argument.
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
        '-d': 'dms', '--dms': 'dms',
    }

    for flag, value in parsed:
        if flag in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if flag in switches:
            options[switches[flag]] = True
        elif flag in valued:
            options[valued[flag]] = value

    options['extra_args'] = positional
90
def load_tweets(filename):
    """Load tweets from a UTF-8 archive file into a dict, see save_tweets().

    Each non-blank line has the form "<tweet id> <tweet text>".

    Args:
        filename: A string naming the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). The dict
        is empty when the file cannot be opened (e.g. no archive yet).

    Raises:
        ValueError: If a non-blank line has no "<id> <text>" structure or the
            id is not an integer.
    """
    # Local import: io.open decodes UTF-8 the same way on Python 2 and 3,
    # whereas str.decode() only exists on Python 2.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # close the file even if a line fails to parse
        for line in archive:
            # Strip ASCII whitespace only, so text is kept exactly as saved.
            line = line.strip(" \t\n\r\x0b\x0c")
            if not line:
                continue  # tolerate blank lines instead of aborting the load
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
105
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when ``tweets`` is empty.
    If the file cannot be opened, the error is reported through err() and
    the function returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import: io.open encodes to UTF-8 on both Python 2 and 3. Encoding
    # by hand and formatting with %s writes the b'...' repr on Python 3.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # close the file even if a write fails
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
131
def format_date(utc, isoformat=False):
    """Convert Twitter's UTC date string into a local-time string.

    Args:
        utc: Date string in Twitter's format; a '+0000' offset is replaced
            by 'UTC' before parsing.
        isoformat: When true, return the datetime's isoformat() string
            instead of 'YYYY-MM-DD HH:MM:SS TZ'.

    Returns:
        The date, converted to the Local timezone, as a string.
    """
    naive = datetime.strptime(utc.replace('+0000', 'UTC'),
                              '%a %b %d %H:%M:%S %Z %Y')
    # Attach the UTC timezone to the parsed (naive) value, then convert
    # to localtime.
    local = naive.replace(tzinfo=UTC).astimezone(Local)

    if isoformat:
        return local.isoformat()
    return local.strftime('%Y-%m-%d %H:%M:%S %Z')
a7282452 148
def expand_format_text(hosts, text):
    """Expand the line with expand_line() for hosts, then make it one line."""
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
152
def direct_format_text(text):
    """Escape line breaks so that the text fits on a single line.

    Newlines become the two characters backslash-n and carriage returns
    become backslash-r.
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
156
4f0b5ca6
H
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names in statuses by looking up user ids.

    A status, or its embedded 'retweeted_status', whose 'user' dict has no
    truthy 'screen_name' gets one from lookup(). The status dicts are
    modified in place.

    Args:
        twitter: API object handed to lookup().
        tl: Sequence of status dicts (iterated twice).

    Returns:
        A new list holding the same status objects, in the same order.
    """
    def embedded_users(status):
        """Yield the retweeted status' user (if any), then the status' user."""
        retweet = status.get('retweeted_status')
        if retweet:
            yield retweet['user']
        yield status['user']

    # collect every user id that needs a lookup (no screen_name key)
    pending = set()
    for status in tl:
        for user in embedded_users(status):
            if not user.get('screen_name'):
                pending.add(user['id'])

    # resolve all of them at once
    names = lookup(twitter, list(pending))

    # write the resolved names back into the statuses
    resolved = []
    for status in tl:
        for user in embedded_users(status):
            if not user.get('screen_name'):
                user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
184
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Fetch one page of statuses and format them as archive lines.

    Args:
        twitter: API object used for the request.
        screen_name: User whose timeline is fetched; a false value selects
            the home timeline.
        max_id: When set, passed to the API as 'max_id'.
        mentions: Fetch the mentions timeline instead.
        favorites: Fetch favorites instead (checked after mentions).
        received_dms: None for regular statuses; True for received direct
            messages, False for sent ones.
        isoformat: Passed through to format_date().

    Returns:
        A dict mapping status ids to formatted text.
    """
    request = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        request['max_id'] = max_id

    is_dm = received_dms is not None

    # pick the endpoint; mentions and favorites take precedence over DMs
    if mentions:
        fetch = twitter.statuses.mentions_timeline
    elif favorites:
        fetch = twitter.favorites.list
    elif is_dm:
        if received_dms:
            fetch = twitter.direct_messages
        else:  # sent DMs
            fetch = twitter.direct_messages.sent
    elif screen_name:
        fetch = twitter.statuses.user_timeline
    else:  # self
        fetch = twitter.statuses.home_timeline
    timeline = fetch(**request)

    # some tweets do not provide screen name but user id, resolve those;
    # this isn't a valid operation for DMs, so they are left untouched
    if not is_dm:
        timeline = statuses_resolve_uids(twitter, timeline)

    portion = {}
    for status in timeline:
        body = status['text']
        retweet = status.get('retweeted_status')
        if retweet:
            body = "RT @%s: %s" % (retweet['user']['screen_name'],
                                   retweet['text'])

        stamp = format_date(status['created_at'], isoformat=isoformat)
        if is_dm:
            # DMs don't include mentions by default, so in order to show
            # who the recipient was, we synthesise a mention.
            line = "%s <%s> @%s %s" % (stamp,
                                       status['sender_screen_name'],
                                       status['recipient']['screen_name'],
                                       format_text(body))
        else:
            line = "%s <%s> %s" % (stamp,
                                   status['user']['screen_name'],
                                   format_text(body))
        portion[status['id']] = line
    return portion
231
def _wait_for_rate_limit_reset(twitter, fail):
    """Report the rate limit status and wait until shortly after its reset."""
    limits = twitter.application.rate_limit_status()
    reset_at = limits.rate_limit_reset
    reset_text = _time.asctime(_time.localtime(reset_at))
    delay = int(reset_at - _time.time()) + 5  # avoid race
    err("Interval limit of %i requests reached, next reset on %s: "
        "going to sleep for %i secs"
        % (limits.rate_limit_limit, reset_text, delay))
    fail.wait(delay)


def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Fetch all statuses page by page and merge them into ``tweets``.

    Pages are requested backwards (decreasing max id) until a page adds
    fewer than 190 new entries to ``tweets``. Errors 401 and 404 stop the
    browsing; a 429 waits for the rate limit reset; other failures are
    retried after a short wait through a Fail counter.

    Args:
        twitter: API object used for the requests.
        screen_name: User to browse; a false value means the home timeline.
        tweets: Dict of already known statuses, updated in place.
        mentions, favorites, received_dms, isoformat: Passed through to
            statuses_portion().
    """
    max_id = None
    fail = Fail()
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id,
                                       mentions, favorites, received_dms,
                                       isoformat)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 429:
                err("Fail: %i API rate limit exceeded" % code)
                _wait_for_rate_limit_reset(twitter, fail)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(portion)
            added = len(tweets) - known_before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", added))
            if added < 190:
                break
            max_id = min(portion.keys()) - 1  # browse backwards
            fail = Fail()  # a successful page resets the failure counter
285
def rate_limit_status(twitter):
    """Print the remaining API requests and the time of the next reset."""
    limits = twitter.application.rate_limit_status()
    print("Remaining API requests: %i/%i (interval limit)"
          % (limits.rate_limit_remaining, limits.rate_limit_limit))

    reset_at = limits.rate_limit_reset
    seconds_left = int(reset_at - _time.time())
    reset_text = _time.asctime(_time.localtime(reset_at))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
a7282452
S
294
def _load_archive(filename, label):
    """Load a saved archive, falling back to an empty dict on any error."""
    try:
        return load_tweets(filename)
    except Exception as e:
        err("Error when loading saved %s: %s - continuing without"
            % (label, str(e)))
        return {}


def _fetch_or_exit(twitter, screen_name, tweets, **kwargs):
    """Run statuses(); on Ctrl-C report the interruption and exit with 1."""
    try:
        statuses(twitter, screen_name, tweets, **kwargs)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)


def main(args=sys.argv[1:]):
    """Command-line entry point of the archiver.

    Parses ``args``, authenticates when OAuth is requested, then depending
    on the options prints the API rate limit status, archives the
    authenticated user's timeline/mentions and direct messages, and
    archives the tweets (or favorites) of every given user.

    Args:
        args: List of command-line arguments (program name excluded).

    Raises:
        SystemExit: With status 1 on invalid options, on a missing
            authentication for timeline/mentions/DMs, or when interrupted.
    """
    # format_text is a module-level hook read by statuses_portion()
    global format_text

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # choose how tweet text is formatted (with or without url expansion)
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = _load_archive(filename, "tweets")
        _fetch_or_exit(twitter, "", tweets,
                       mentions=options['mentions'],
                       favorites=options['favorites'],
                       isoformat=options['isoformat'])

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # save own direct messages, received and sent, into a single archive
    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = _load_archive(filename, "DMs")
        _fetch_or_exit(twitter, "", dms, received_dms=True,
                       isoformat=options['isoformat'])
        _fetch_or_exit(twitter, "", dms, received_dms=False,
                       isoformat=options['isoformat'])

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: an empty user name would be treated as the home
        # timeline and its archive path would be the save directory itself.
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_archive(filename, "tweets")
        before = len(tweets)
        _fetch_or_exit(twitter, user, tweets,
                       mentions=options['mentions'],
                       favorites=options['favorites'],
                       isoformat=options['isoformat'])

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))