]> jfr.im git - z_archive/twitter.git/blame - twitter/archiver.py
Merge pull request #69 from Lacrymology/master
[z_archive/twitter.git] / twitter / archiver.py
CommitLineData
a7282452
S
1"""USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11OPTIONS
12 -o --oauth authenticate to Twitter using OAuth (default no)
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
16 OAuth, max 800 statuses).
17
18AUTHENTICATION
19 Authenticate to Twitter using OAuth to archive tweets of private profiles
20 and have higher API rate limits. OAuth authentication tokens are stored
21 in ~/.twitter-archiver_oauth.
22"""
23
24from __future__ import print_function
25
26import os, sys, time, calendar, urllib2, httplib
27from getopt import gnu_getopt as getopt, GetoptError
28
29# T-Archiver (Twitter-Archiver) application registered by @stalkr_
30CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
31CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
32
33from .api import Twitter, TwitterError
34from .oauth import OAuth, read_token_file
35from .oauth_dance import oauth_dance
36from .auth import NoAuth
37from .util import Fail, err
38from .follow import lookup
39
def parse_args(args, options):
    """Parse command-line arguments and record them in ``options``.

    Args:
        args: list of command-line argument strings (without program name).
        options: dict updated in place; recognised switches set the keys
            'oauth', 'save-dir', 'api-rate' and 'timeline', and every
            non-option argument ends up in 'extra_args'.

    Raises:
        GetoptError: on an unknown option or a missing option argument.
        SystemExit: with status 0 after printing the usage for -h/--help.
    """
    # Switches that simply turn a boolean option on.
    boolean_flags = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
    }
    # Switches that carry a value.
    valued_flags = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
    }

    parsed, positional = getopt(
        args, "hos:at:",
        ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline='])

    for flag, value in parsed:
        if flag == '-h' or flag == '--help':
            print(__doc__)
            raise SystemExit(0)
        if flag in boolean_flags:
            options[boolean_flags[flag]] = True
        elif flag in valued_flags:
            options[valued_flags[flag]] = value

    options['extra_args'] = positional
60
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    The archive is read as UTF-8 text, one tweet per line in the form
    ``<tweet id> <tweet text>``.

    Args:
        filename: path of the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). An
        empty dict is returned when the archive cannot be opened.

    Raises:
        ValueError: if a non-blank line is not ``<int id> <text>``.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Local import so the block does not depend on the file-level imports.
    import io

    try:
        # io.open decodes identically on Python 2 and 3; newline="\n" keeps
        # line splitting on "\n" only, as the original byte-mode read did.
        archive = io.open(filename, "r", encoding="utf-8", newline="\n")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    # The context manager closes the file even when a malformed line raises.
    with archive:
        for line in archive:
            # Strip ASCII whitespace only, so tweets that end with a
            # non-ASCII space character are preserved unchanged.
            line = line.strip(" \t\n\r\x0b\x0c")
            if not line:
                # A stray blank line must not abort loading the archive.
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
75
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    ascending tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and the file is not created or truncated) when
    ``tweets`` is empty. A failure to open the file is reported through
    err() and otherwise ignored.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import so the block does not depend on the file-level imports.
    import io

    if not tweets:
        return

    try:
        # Let the file object do the UTF-8 encoding so the same code is
        # correct on Python 2 and Python 3.
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    # The context manager closes the file even if a write fails midway.
    with archive:
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
101
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: date string such as "Mon Jan 02 15:04:05 +0000 2012".
        to_localtime: when true and the local zone's standard offset is
            not zero, convert to local time; otherwise keep UTC.

    Returns:
        "YYYY-MM-DD HH:MM:SS TZ", where TZ is "UTC" or the local zone name
        in effect at that moment (standard or daylight-saving).
    """
    parsed = time.strptime(utc.replace('+0000', 'UTC'),
                           '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        local = time.localtime(calendar.timegm(parsed))
        # Pick the zone name matching the converted time: tzname[1] is the
        # daylight-saving name and must only be used when DST is in effect.
        zone = time.tzname[1 if local.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", local) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", parsed)
110
def format_text(text):
    """Escape line breaks so the text fits on a single archive line.

    Newlines become the two characters ``\\n`` and carriage returns become
    ``\\r``; everything else is returned unchanged.
    """
    escaped = text
    for raw, visible in (('\n', '\\n'), ('\r', '\\r')):
        escaped = escaped.replace(raw, visible)
    return escaped
114
def timeline_resolve_uids(twitter, tl):
    """Fill in missing screen names of a timeline from user ids.

    Statuses (and their 'retweeted_status', if any) whose 'user' entry has
    no usable 'screen_name' are collected, resolved with a single lookup()
    call, and patched in place.

    Args:
        twitter: API object handed through to lookup().
        tl: iterable of status dicts; iterated twice.

    Returns:
        A new list holding the same (now patched) status dicts.
    """
    def nameless(user):
        # True when the user dict carries no usable screen name.
        return not user.get('screen_name')

    # First pass: gather every user id that still needs a screen name.
    pending = []
    for status in tl:
        retweeted = status.get('retweeted_status')
        if retweeted and nameless(retweeted['user']):
            pending.append(retweeted['user']['id'])
        if nameless(status['user']):
            pending.append(status['user']['id'])

    # One lookup for all distinct ids.
    resolved = lookup(twitter, list(set(pending)))

    # Second pass: write the resolved names back into the statuses.
    patched = []
    for status in tl:
        retweeted = status.get('retweeted_status')
        if retweeted and nameless(retweeted['user']):
            retweeted['user']['screen_name'] = resolved[retweeted['user']['id']]
        author = status['user']
        if nameless(author):
            author['screen_name'] = resolved[author['id']]
        patched.append(status)

    return patched
142
def timeline_portion(twitter, screen_name, max_id=None):
    """Fetch one page of a timeline and format it for the archive.

    Args:
        twitter: API object; statuses.user_timeline is used when
            ``screen_name`` is truthy, statuses.home_timeline otherwise.
        screen_name: user whose timeline is fetched, or a falsy value for
            the authenticated user's home timeline.
        max_id: optional tweet id passed to the API as 'max_id'.

    Returns:
        A dict mapping tweet ids to "<date> <<screen_name>> <text>" strings.
    """
    params = {'count': 200, 'include_rts': 1, 'screen_name': screen_name}
    if max_id:
        params['max_id'] = max_id

    if screen_name:
        statuses = twitter.statuses.user_timeline(**params)
    else:  # self
        statuses = twitter.statuses.home_timeline(**params)

    portion = {}
    # some tweets do not provide screen name but user id, resolve those
    for status in timeline_resolve_uids(twitter, statuses):
        body = status['text']
        retweeted = status.get('retweeted_status')
        if retweeted:
            # Rebuild the retweet text from the retweeted status itself.
            body = "RT @%s: %s" % (retweeted['user']['screen_name'],
                                   retweeted['text'])
        when = format_date(status['created_at'])
        author = status['user']['screen_name']
        portion[status['id']] = "%s <%s> %s" % (when, author,
                                                format_text(body))

    return portion
166
def timeline(twitter, screen_name, tweets):
    """Fetch the whole timeline of a screen name into ``tweets``.

    Pages are requested with timeline_portion(), walking backwards through
    the timeline, until a page adds fewer than 190 new tweets. ``tweets``
    (dict of tweet id -> text) is updated in place.

    Error handling:
        * 401 and 404 API errors are reported and stop the archiving.
        * A 400 API error is treated as rate limiting: the function sleeps
          until the reported reset time and retries.
        * Other API errors, URL/HTTP errors and KeyError are reported and
          retried after a wait of 3 (passed to Fail.wait).
    """
    oldest_id = None
    backoff = Fail()
    # get portions of timeline, moving max id backwards until no new tweets
    while True:
        try:
            batch = timeline_portion(twitter, screen_name, oldest_id)
        except TwitterError as exc:
            status = exc.e.code
            if status == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % status)
                break
            if status == 400:
                err("Fail: %i API rate limit exceeded" % status)
                limits = twitter.account.rate_limit_status()
                reset_at = limits['reset_time_in_seconds']
                reset_text = time.asctime(time.localtime(reset_at))
                # a few extra seconds to avoid racing the reset
                pause = int(reset_at - time.time()) + 5
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (limits['hourly_limit'],
                                                    reset_text, pause))
                backoff.wait(pause)
                continue
            if status == 404:
                err("Fail: %i This profile does not exist" % status)
                break
            if status == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % status)
            else:
                err("Fail: %s\nRetrying..." % str(exc)[:500])
            backoff.wait(3)
        except urllib2.URLError as exc:
            err("Fail: urllib2.URLError %s - Retrying..." % str(exc))
            backoff.wait(3)
        except httplib.error as exc:
            err("Fail: httplib.error %s - Retrying..." % str(exc))
            backoff.wait(3)
        except KeyError as exc:
            err("Fail: KeyError %s - Retrying..." % str(exc))
            backoff.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(batch)
            added = len(tweets) - known_before
            err("Browsing %s timeline, new tweets: %i"
                % (screen_name or "home", added))
            if added < 190:
                break
            oldest_id = min(batch)  # browse backwards
            backoff = Fail()  # a successful page resets the failure counter
220
def rate_limit_status(twitter):
    """Print the current Twitter API rate limit status to stdout.

    Reads 'remaining_hits', 'hourly_limit' and 'reset_time_in_seconds'
    from the result of twitter.account.rate_limit_status().
    """
    limits = twitter.account.rate_limit_status()

    quota = (limits['remaining_hits'], limits['hourly_limit'])
    print("Remaining API requests: %i/%i (hourly limit)" % quota)

    reset_at = limits['reset_time_in_seconds']
    seconds_left = int(reset_at - time.time())
    reset_text = time.asctime(time.localtime(reset_at))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
229
def _archive_timeline(twitter, screen_name, filename):
    """Load, refresh and save one archive; return (total, new) tweet counts.

    An empty ``screen_name`` archives the authenticated user's home
    timeline. A KeyboardInterrupt while fetching exits with status 1
    without saving.
    """
    tweets = {}
    try:
        tweets = load_tweets(filename)
    except Exception as e:
        # Best effort: an unreadable archive must not prevent archiving.
        err("Error when loading saved tweets: %s - continuing without"
            % str(e))

    before = len(tweets)
    try:
        timeline(twitter, screen_name, tweets)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)

    save_tweets(filename, tweets)
    return len(tweets), len(tweets) - before


def main(args=None):
    """Command-line entry point of twitter-archiver.

    Args:
        args: list of command-line arguments without the program name;
            defaults to sys.argv[1:], read at call time.

    Raises:
        SystemExit: status 1 on invalid options, when --timeline is used
            without OAuth, or when interrupted; status 0 after --help.
    """
    if args is None:
        # Read sys.argv when called, not when the module is imported.
        args = sys.argv[1:]

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': ""
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate or archive of timeline
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # save own timeline (the user used in OAuth)
    if options['timeline']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['timeline']
        print("* Archiving own timeline in %s" % filename)

        # no screen_name means we want home_timeline, not user_timeline
        count, _ = _archive_timeline(twitter, "", filename)
        print("Total tweets in own timeline: %i" % count)

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: an empty user name would be archived as the
        # home timeline into a file named after the save directory.
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        print("* Archiving %s tweets in %s" % (user, filename))

        count, new = _archive_timeline(twitter, user, filename)
        total += count
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, count, new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))