]>
Commit | Line | Data |
---|---|---|
a7282452 S |
1 | """USAGE |
2 | twitter-archiver [options] <-|user> [<user> ...] | |
3 | ||
4 | DESCRIPTION | |
5 | Archive tweets of users, sorted by date from oldest to newest, in | |
6 | the following format: <id> <date> <<screen_name>> <tweet_text> | |
7 | Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to | |
8 | resume archiving on next run. Archive file name is the user name. | |
9 | Provide "-" instead of users to read users from standard input. | |
10 | ||
11 | OPTIONS | |
12 | -o --oauth authenticate to Twitter using OAuth (default no) | |
13 | -s --save-dir <path> directory to save archives (default: current dir) | |
14 | -a --api-rate see current API rate limit status | |
15 | -t --timeline <file> archive own timeline into given file name (requires | |
16 | OAuth, max 800 statuses). | |
17 | ||
18 | AUTHENTICATION | |
19 | Authenticate to Twitter using OAuth to archive tweets of private profiles | |
20 | and have higher API rate limits. OAuth authentication tokens are stored | |
21 | in ~/.twitter-archiver_oauth. | |
22 | """ | |
23 | ||
24 | from __future__ import print_function | |
25 | ||
26 | import os, sys, time, calendar, urllib2, httplib | |
27 | from getopt import gnu_getopt as getopt, GetoptError | |
28 | ||
29 | # T-Archiver (Twitter-Archiver) application registered by @stalkr_ | |
30 | CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ' | |
31 | CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8' | |
32 | ||
33 | from .api import Twitter, TwitterError | |
34 | from .oauth import OAuth, read_token_file | |
35 | from .oauth_dance import oauth_dance | |
36 | from .auth import NoAuth | |
37 | from .util import Fail, err | |
38 | from .follow import lookup | |
39 | ||
def parse_args(args, options):
    """Fill *options* in place from the command-line arguments *args*.

    Recognized switches are stored under the keys 'oauth', 'save-dir',
    'api-rate' and 'timeline'; positional arguments are stored as a list
    under 'extra_args'. With -h/--help the module docstring is printed
    and SystemExit(0) is raised. Unknown options make getopt raise
    GetoptError, which is left to the caller.
    """
    parsed, positional = getopt(
        args, "hos:at:",
        ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline='])

    # switches that simply turn a boolean option on
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
    }
    # switches whose argument is stored as the option value
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
    }

    for name, value in parsed:
        if name in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if name in switches:
            options[switches[name]] = True
        elif name in valued:
            options[valued[name]] = value

    options['extra_args'] = positional
60 | ||
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    The archive is read as bytes and each tweet text is decoded from
    UTF-8, so the function behaves the same on Python 2 and Python 3.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet-ids (int) to tweet text (unicode string).
        An empty dict is returned when the file cannot be opened, which
        is the normal case when no archive exists yet.

    Raises:
        ValueError: a non-blank line has no "<id> <text>" structure or
            its id is not a number.
        UnicodeDecodeError: a tweet text is not valid UTF-8.
    """
    try:
        archive = open(filename, "rb")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    # "with" guarantees the file is closed even if a line fails to parse
    with archive:
        for line in archive:
            line = line.strip()
            if not line:
                # a stray blank line must not invalidate the whole archive
                continue
            tid, text = line.split(b" ", 1)
            tweets[int(tid)] = text.decode("utf-8")

    return tweets
75 | ||
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and the file is not created or truncated) when
    the dict is empty. If the file cannot be opened, the error is
    reported through err() and the function returns normally.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    if not tweets:
        return

    try:
        # binary mode: lines are encoded explicitly below, which keeps the
        # on-disk format identical on Python 2 and Python 3
        archive = open(filename, "wb")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    # "with" guarantees the file is closed even if a write fails
    with archive:
        for tid in sorted(tweets):
            # format as text first, then encode once; formatting already
            # encoded bytes would write a b'...' repr on Python 3
            line = u"%i %s\n" % (tid, tweets[tid])
            archive.write(line.encode("utf-8"))
101 | ||
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: A date string such as "Mon Jan 02 15:04:05 +0000 2012".
        to_localtime: When true and the local zone has a non-zero
            standard UTC offset, convert the date to local time.

    Returns:
        The date formatted as "YYYY-MM-DD HH:MM:SS TZ", where TZ is the
        local zone name in effect at that date, or "UTC".
    """
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name): pick the one that
        # actually applies to this date instead of always the DST name
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
110 | ||
def format_text(text):
    """Escape line breaks so that *text* fits on a single line.

    Newlines become the two characters backslash-n and carriage returns
    become backslash-r; everything else is left untouched.
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
114 | ||
def timeline_resolve_uids(twitter, tl):
    """Fill in missing screen names in a timeline from the user ids.

    Every status (and its 'retweeted_status', when present) whose 'user'
    entry has no 'screen_name' gets one, obtained by a single lookup()
    call for all affected user ids. The status dicts are updated in
    place and returned in a new list, in their original order.
    """
    def unnamed_users(status):
        """Yield the user dicts of *status* lacking a screen name."""
        retweet = status.get('retweeted_status')
        if retweet and not retweet['user'].get('screen_name'):
            yield retweet['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # collect every user id that needs resolving
    user_ids = []
    for status in tl:
        for user in unnamed_users(status):
            user_ids.append(user['id'])

    # one lookup for all of them, without duplicates
    names = lookup(twitter, list(set(user_ids)))

    # apply the resolved names
    resolved = []
    for status in tl:
        for user in unnamed_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
142 | ||
def timeline_portion(twitter, screen_name, max_id=None):
    """Fetch one page of a timeline as a dict of formatted tweets.

    Up to 200 statuses, retweets included, are requested. A non-empty
    screen_name selects that user's timeline; an empty one selects the
    home timeline. When max_id is given it is passed on to the API call.

    Returns a dict mapping tweet id to "<date> <<screen_name>> <text>",
    where retweets are rendered as "RT @<user>: <original text>".
    """
    params = {'count': 200, 'include_rts': 1, 'screen_name': screen_name}
    if max_id:
        params['max_id'] = max_id

    if screen_name:
        fetch = twitter.statuses.user_timeline
    else:  # self
        fetch = twitter.statuses.home_timeline
    statuses = fetch(**params)

    portion = {}
    # some tweets carry only a user id, so resolve screen names first
    for status in timeline_resolve_uids(twitter, statuses):
        body = status['text']
        retweet = status.get('retweeted_status')
        if retweet:
            body = "RT @%s: %s" % (retweet['user']['screen_name'],
                                   retweet['text'])
        when = format_date(status['created_at'])
        author = status['user']['screen_name']
        portion[status['id']] = "%s <%s> %s" % (when, author,
                                                format_text(body))

    return portion
166 | ||
def timeline(twitter, screen_name, tweets):
    """Download the timeline of a screen name into the *tweets* dict.

    Pages are fetched with timeline_portion(), walking backwards through
    the timeline, and merged into *tweets* in place. Fetching stops when
    a page adds fewer than 190 new tweets, or on a 401 response. Other
    errors are reported with err() and the same page is retried after
    Fail.wait(); on a 400 response the wait lasts until the rate limit
    reset time reported by the API.
    """
    max_id = None
    fail = Fail()
    while True:
        try:
            portion = timeline_portion(twitter, screen_name, max_id)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_ts = rate['reset_time_in_seconds']
                reset_text = time.asctime(time.localtime(reset_ts))
                delay = int(reset_ts - time.time()) + 5  # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known = len(tweets)
            tweets.update(portion)
            added = len(tweets) - known
            err("Browsing %s timeline, new tweets: %i"
                % (screen_name or "home", added))
            if added < 190:
                break
            # continue from the oldest tweet of this page (browse backwards)
            max_id = min(portion)
            # a successful page starts a fresh failure tracker
            fail = Fail()
217 | ||
def rate_limit_status(twitter):
    """Print the API rate limit status reported by Twitter.

    Shows the remaining and hourly request counts, followed by the
    number of seconds until the next reset and its local date.
    """
    status = twitter.account.rate_limit_status()
    print("Remaining API requests: %i/%i (hourly limit)"
          % (status['remaining_hits'], status['hourly_limit']))

    seconds_left = int(status['reset_time_in_seconds'] - time.time())
    reset_date = time.asctime(
        time.localtime(status['reset_time_in_seconds']))
    print("Next reset in %is (%s)" % (seconds_left, reset_date))
226 | ||
def _archive_timeline(twitter, screen_name, filename):
    """Update the archive *filename* with the timeline of *screen_name*.

    Previously saved tweets are loaded (a failure to load is reported and
    the archive is treated as empty), the timeline is fetched on top of
    them and the result is saved back. An empty screen_name archives the
    home timeline. On KeyboardInterrupt nothing is saved and SystemExit(1)
    is raised.

    Returns:
        A (total, new) tuple: the number of tweets now in the archive and
        how many of them were added by this run.
    """
    try:
        tweets = load_tweets(filename)
    except Exception as e:
        err("Error when loading saved tweets: %s - continuing without"
            % str(e))
        tweets = {}

    before = len(tweets)
    try:
        timeline(twitter, screen_name, tweets)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)

    save_tweets(filename, tweets)
    return len(tweets), len(tweets) - before


def main(args=sys.argv[1:]):
    """Run the twitter-archiver command line.

    Args:
        args: The command-line arguments, without the program name.

    Raises:
        SystemExit: on invalid options, when --timeline is requested
            without OAuth, or when archiving is interrupted.
    """
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': ""
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate or archive of timeline
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # save own timeline (the user used in OAuth)
    if options['timeline']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['timeline']
        print("* Archiving own timeline in %s" % filename)

        # no screen_name means we want home_timeline, not user_timeline
        count, _ = _archive_timeline(twitter, "", filename)
        print("Total tweets in own timeline: %i" % count)

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        users = [line.strip() for line in sys.stdin]
    # An empty name would select the home timeline and use the save
    # directory itself as the archive path, so blank entries (typically
    # empty lines on stdin) are dropped.
    users = [user for user in users if user]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        print("* Archiving %s tweets in %s" % (user, filename))

        count, new = _archive_timeline(twitter, user, filename)
        total += count
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, count, new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))