]> jfr.im git - z_archive/twitter.git/blame - twitter/archiver.py
twitter-archiver and twitter-follow initial import
[z_archive/twitter.git] / twitter / archiver.py
CommitLineData
a7282452
S
1"""USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11OPTIONS
12 -o --oauth authenticate to Twitter using OAuth (default no)
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
16 OAuth, max 800 statuses).
17
18AUTHENTICATION
19 Authenticate to Twitter using OAuth to archive tweets of private profiles
20 and have higher API rate limits. OAuth authentication tokens are stored
21 in ~/.twitter-archiver_oauth.
22"""
23
24from __future__ import print_function
25
26import os, sys, time, calendar, urllib2, httplib
27from getopt import gnu_getopt as getopt, GetoptError
28
29# T-Archiver (Twitter-Archiver) application registered by @stalkr_
30CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
31CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
32
33from .api import Twitter, TwitterError
34from .oauth import OAuth, read_token_file
35from .oauth_dance import oauth_dance
36from .auth import NoAuth
37from .util import Fail, err
38from .follow import lookup
39
def parse_args(args, options):
    """Fill ``options`` in place from the command-line arguments.

    Args:
        args: list of command-line argument strings (without program name).
        options: dict updated in place; recognised flags set the keys
            'oauth', 'save-dir', 'api-rate' and 'timeline', and the
            remaining positional arguments are stored under 'extra_args'.

    Raises:
        GetoptError: an option is unknown or lacks its required value.
        SystemExit: after printing the usage text for -h/--help.
    """
    parsed, positional = getopt(
        args,
        "hos:at:",
        ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline='])

    # flags that merely switch an option on
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
    }
    # flags that carry a value
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
    }

    for flag, value in parsed:
        if flag == '-h' or flag == '--help':
            print(__doc__)
            raise SystemExit(0)
        if flag in switches:
            options[switches[flag]] = True
        elif flag in valued:
            options[valued[flag]] = value

    options['extra_args'] = positional
60
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Args:
        filename: A string naming the UTF-8 encoded archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode). The dict is
        empty when the archive cannot be opened (e.g. it does not exist yet).

    Raises:
        ValueError: a non-blank line is not of the form "<id> <text>", or
            the file is not valid UTF-8.
    """
    import io  # local import: io.open decodes UTF-8 on Python 2 and 3 alike

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # close the file even when a line fails to parse
        for line in archive:
            line = line.strip()
            if not line:
                # a stray blank line must not invalidate the whole archive
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
75
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    ascending tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when ``tweets`` is empty.
    If the file cannot be opened, the error is reported through err() and
    the function returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    import io  # local import: io.open encodes UTF-8 on Python 2 and 3 alike

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # close the file even when a write fails
        for tid in sorted(tweets):
            # unicode template: io text files only accept unicode on Python 2
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
101
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: date string in the form "Wed Aug 27 13:08:45 +0000 2008".
        to_localtime: when True and the local standard offset is not UTC,
            convert the date to local time; otherwise keep it in UTC.

    Returns:
        The date formatted as "YYYY-MM-DD HH:MM:SS TZ".
    """
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name): use the one in effect
        # for this particular date instead of always the DST name
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
110
def format_text(text):
    """Escape line breaks so that the text fits on a single line.

    Newlines become the two characters ``\\n`` and carriage returns the two
    characters ``\\r``; everything else is left untouched.
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
114
def timeline_resolve_uids(twitter, tl):
    """Fill in missing screen names of a timeline from user ids.

    Statuses (and their 'retweeted_status', if any) whose 'user' entry has
    no 'screen_name' get one, obtained with a single lookup() call for all
    of the ids involved. The status dicts are modified in place.

    Args:
        twitter: API object handed over to lookup().
        tl: sequence of status dicts.

    Returns:
        A new list holding the same status dicts, in the same order.
    """
    def users_of(status):
        # user dicts of a status: retweeted author first, then the poster
        retweet = status.get('retweeted_status')
        if retweet:
            yield retweet['user']
        yield status['user']

    # collect every user id that comes without a screen name
    unnamed_ids = []
    for status in tl:
        for user in users_of(status):
            if not user.get('screen_name'):
                unnamed_ids.append(user['id'])

    # one lookup for all of them, duplicates removed
    names = lookup(twitter, list(set(unnamed_ids)))

    # patch the statuses and collect them into a fresh list
    resolved = []
    for status in tl:
        for user in users_of(status):
            if not user.get('screen_name'):
                user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
142
def timeline_portion(twitter, screen_name, max_id=None):
    """Fetch one page (up to 200 statuses) of a timeline.

    Args:
        twitter: API object exposing statuses.user_timeline and
            statuses.home_timeline.
        screen_name: user whose timeline is fetched; a false value (e.g. "")
            selects the home timeline instead.
        max_id: when set, passed to the API as 'max_id'.

    Returns:
        A dict mapping tweet ids to "<date> <<screen_name>> <text>" lines;
        retweets are rendered as "RT @<author>: <original text>".
    """
    params = {'count': 200, 'include_rts': 1, 'screen_name': screen_name}
    if max_id:
        params['max_id'] = max_id

    # no screen name means "self": the home timeline
    fetch = (twitter.statuses.user_timeline if screen_name
             else twitter.statuses.home_timeline)
    statuses = fetch(**params)

    portion = {}
    # some tweets carry a user id but no screen name, resolve those first
    for status in timeline_resolve_uids(twitter, statuses):
        body = status['text']
        retweet = status.get('retweeted_status')
        if retweet:
            body = "RT @%s: %s" % (retweet['user']['screen_name'],
                                   retweet['text'])
        when = format_date(status['created_at'])
        author = status['user']['screen_name']
        portion[status['id']] = "%s <%s> %s" % (when, author,
                                                format_text(body))

    return portion
166
def timeline(twitter, screen_name, tweets):
    """Download a timeline page by page, merging it into ``tweets``.

    Pages are requested going backwards in time until a page contributes
    fewer than 190 previously unknown tweets. API and network errors are
    reported with err() and retried, except HTTP 401 which stops the
    download.

    Args:
        twitter: API object.
        screen_name: user to archive; a false value means the home timeline.
        tweets: dict of tweet id -> text, updated in place.
    """
    label = screen_name if screen_name else "home"
    oldest_id = None
    fail = Fail()
    # fetch portions, lowering the max id until no new tweets appear
    while True:
        try:
            portion = timeline_portion(twitter, screen_name, oldest_id)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_ts = rate['reset_time_in_seconds']
                reset_text = time.asctime(time.localtime(reset_ts))
                # a few extra seconds so we do not wake up before the reset
                delay = int(reset_ts - time.time()) + 5
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known_before
            err("Browsing %s timeline, new tweets: %i" % (label, new))
            if new < 190:
                break
            oldest_id = min(portion.keys())  # browse backwards
            fail = Fail()  # fresh Fail object after every successful page
217
def rate_limit_status(twitter):
    """Print the remaining API quota and the time of its next reset.

    Args:
        twitter: API object exposing account.rate_limit_status(), whose
            result provides 'remaining_hits', 'hourly_limit' and
            'reset_time_in_seconds'.
    """
    status = twitter.account.rate_limit_status()

    quota = (status['remaining_hits'], status['hourly_limit'])
    print("Remaining API requests: %i/%i (hourly limit)" % quota)

    seconds_left = int(status['reset_time_in_seconds'] - time.time())
    reset_text = time.asctime(
        time.localtime(status['reset_time_in_seconds']))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
226
def _load_saved_tweets(filename):
    """Return the tweets already archived in filename, or {} if unreadable."""
    try:
        return load_tweets(filename)
    except Exception as e:
        err("Error when loading saved tweets: %s - continuing without"
            % str(e))
        return {}


def _fetch_timeline(twitter, screen_name, tweets):
    """Run timeline(), turning Ctrl-C into an exit with status 1."""
    try:
        timeline(twitter, screen_name, tweets)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)


def main(args=sys.argv[1:]):
    """Command-line entry point of twitter-archiver.

    Parses the options, optionally authenticates with OAuth, then either
    prints the API rate limit status or archives the own timeline and/or
    the tweets of every requested user into the save directory.

    Args:
        args: list of command-line arguments (defaults to sys.argv[1:]).

    Raises:
        SystemExit: on invalid options, on a timeline request without
            OAuth, or when interrupted from the keyboard.
    """
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': ""
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate or archive of timeline
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # save own timeline (the user used in OAuth)
    if options['timeline']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['timeline']
        print("* Archiving own timeline in %s" % filename)

        tweets = _load_saved_tweets(filename)
        # no screen_name means we want home_timeline, not user_timeline
        _fetch_timeline(twitter, "", tweets)

        save_tweets(filename, tweets)
        print("Total tweets in own timeline: %i" % len(tweets))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # skip blank lines: an empty screen name would make timeline()
        # fetch the home timeline instead of a user's tweets
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_saved_tweets(filename)
        before = len(tweets)
        _fetch_timeline(twitter, user, tweets)

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))