]>
Commit | Line | Data |
---|---|---|
1 | """USAGE | |
2 | twitter-archiver [options] <-|user> [<user> ...] | |
3 | ||
4 | DESCRIPTION | |
5 | Archive tweets of users, sorted by date from oldest to newest, in | |
6 | the following format: <id> <date> <<screen_name>> <tweet_text> | |
7 | Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to | |
8 | resume archiving on next run. Archive file name is the user name. | |
9 | Provide "-" instead of users to read users from standard input. | |
10 | ||
11 | OPTIONS | |
12 | -o --oauth authenticate to Twitter using OAuth (default: no) | |
13 | -s --save-dir <path> directory to save archives (default: current dir) | |
14 | -a --api-rate see current API rate limit status | |
15 | -t --timeline <file> archive own timeline into given file name (requires | |
16 | OAuth, max 800 statuses) | |
17 | -m --mentions <file> archive own mentions instead of timeline into | |
18 | given file name (requires OAuth, max 800 statuses) | |
19 | -v --favorites archive user's favorites instead of timeline | |
20 | -f --follow-redirects follow redirects of urls | |
21 | -r --redirect-sites follow redirects for this comma separated list of hosts | |
22 | -d --dms <file> archive own direct messages (both received and | |
23 | sent) into given file name. | |
24 | ||
25 | AUTHENTICATION | |
26 | Authenticate to Twitter using OAuth to archive tweets of private profiles | |
27 | and have higher API rate limits. OAuth authentication tokens are stored | |
28 | in ~/.twitter-archiver_oauth. | |
29 | """ | |
30 | ||
31 | from __future__ import print_function | |
32 | ||
33 | import os, sys, time, calendar, functools | |
34 | from getopt import gnu_getopt as getopt, GetoptError | |
35 | ||
36 | try: | |
37 | import urllib.request as urllib2 | |
38 | import http.client as httplib | |
39 | except ImportError: | |
40 | import urllib2 | |
41 | import httplib | |
42 | ||
43 | ||
44 | # T-Archiver (Twitter-Archiver) application registered by @stalkr_ | |
45 | CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ' | |
46 | CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8' | |
47 | ||
48 | from .api import Twitter, TwitterError | |
49 | from .oauth import OAuth, read_token_file | |
50 | from .oauth_dance import oauth_dance | |
51 | from .auth import NoAuth | |
52 | from .util import Fail, err, expand_line, parse_host_list | |
53 | from .follow import lookup | |
54 | ||
def parse_args(args, options):
    """Parse command-line arguments and record them in ``options``.

    Args:
        args: list of command-line argument strings (program name excluded).
        options: dict updated in place. Flags without a value are stored as
            True, flags with a value store that value, and the remaining
            positional arguments are stored under 'extra_args'.

    Raises:
        GetoptError: propagated from getopt for an unknown option or a
            missing option value.
        SystemExit: with status 0 after printing the module documentation
            when -h/--help is given.
    """
    short_opts = "hos:at:m:vfr:d:"
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 "redirect-sites=", 'dms=']

    # (short flag, options key, whether the flag carries a value);
    # the long flag is always '--' + options key.
    flag_specs = (
        ('-o', 'oauth', False),
        ('-s', 'save-dir', True),
        ('-a', 'api-rate', False),
        ('-t', 'timeline', True),
        ('-m', 'mentions', True),
        ('-v', 'favorites', False),
        ('-f', 'follow-redirects', False),
        ('-r', 'redirect-sites', True),
        ('-d', 'dms', True),
    )
    targets = {}
    for short, key, takes_value in flag_specs:
        targets[short] = targets['--' + key] = (key, takes_value)

    parsed, extra_args = getopt(args, short_opts, long_opts)
    for flag, value in parsed:
        if flag in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if flag in targets:
            key, takes_value = targets[flag]
            options[key] = value if takes_value else True

    options['extra_args'] = extra_args
85 | ||
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    The archive is read as UTF-8 text, one "<tweet id> <tweet text>" entry
    per line. Blank lines are ignored.

    Args:
        filename: A string representing the file name to load tweets from.

    Returns:
        A dict mapping tweet-ids (int) to tweet text (unicode text). The dict
        is empty when the archive cannot be opened (e.g. it does not exist
        yet).

    Raises:
        ValueError: a non-blank line is not of the form "<id> <text>", or
            the file is not valid UTF-8.
    """
    # Local import: io.open decodes UTF-8 identically on Python 2 and 3,
    # unlike calling .decode() on each line, which only exists on Python 2.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # closed even when a line fails to parse
        for line in archive:
            line = line.strip()
            if not line:
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
100 | ||
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when ``tweets`` is empty.
    A failure to open the file is reported through err() and the function
    returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import: io.open encodes to UTF-8 identically on Python 2 and 3.
    # Encoding each text by hand and formatting it with %s would write the
    # bytes repr (b'...') into the archive on Python 3.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # closed even when a write fails
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
126 | ||
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: date string such as "Mon Jan 02 15:04:05 +0000 2006".
        to_localtime: when true and the local timezone offset is not zero,
            convert to local time; otherwise keep UTC.

    Returns:
        "YYYY-MM-DD HH:MM:SS TZ" where TZ is the local timezone name in
        effect at that date, or "UTC" when no conversion is done.

    Raises:
        ValueError: ``utc`` does not match the expected format.
    """
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name): pick the one that
        # applies to this particular date instead of always the DST name.
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
135 | ||
def expand_format_text(hosts, text):
    """Follow redirects in the links of ``text``, then make it one line.

    ``text`` and ``hosts`` are handed to expand_line(); its result is then
    passed through direct_format_text().
    """
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
139 | ||
def direct_format_text(text):
    """Escape line breaks so that the text fits on a single line.

    Every newline becomes the two characters backslash + n and every
    carriage return becomes backslash + r.
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
143 | ||
def statuses_resolve_uids(twitter, tl):
    """Resolve user ids to screen names from statuses.

    Statuses (and their 'retweeted_status', when present) whose 'user' has
    no 'screen_name' get one filled in, in place, from a single lookup()
    call covering all the missing user ids.

    Returns:
        A new list holding the same (updated) status dicts, in order.
    """
    def unnamed_users(status):
        """Return the user dicts of ``status`` lacking a screen name,
        the retweeted status' user first."""
        found = []
        retweeted = status.get('retweeted_status')
        if retweeted and not retweeted['user'].get('screen_name'):
            found.append(retweeted['user'])
        if not status['user'].get('screen_name'):
            found.append(status['user'])
        return found

    # collect every user id that needs a lookup
    user_ids = []
    for status in tl:
        user_ids.extend(user['id'] for user in unnamed_users(status))

    # resolve all of them at once, without duplicates
    names = lookup(twitter, list(set(user_ids)))

    # fill the resolved names in
    resolved = []
    for status in tl:
        for user in unnamed_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
171 | ||
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API client the request is issued on.
        screen_name: user to query; when empty, the timeline request goes to
            home_timeline instead of user_timeline.
        max_id: when truthy, sent as the 'max_id' request parameter.
        mentions: request statuses.mentions.
        favorites: request favorites (checked after ``mentions``).
        received_dms: None for regular statuses; True requests received
            direct messages, False requests sent ones.

    Returns:
        A dict mapping status id to its formatted archive text.
    """
    request = {'count': 200, 'include_rts': 1, 'screen_name': screen_name}
    if max_id:
        request['max_id'] = max_id

    is_dm = received_dms is not None

    # pick the endpoint matching the kind of statuses asked for
    if mentions:
        fetch = twitter.statuses.mentions
    elif favorites:
        fetch = twitter.favorites  # API v1, favorites.list() in v1.1
    elif is_dm:
        if received_dms:
            fetch = twitter.direct_messages
        else:  # sent DMs
            fetch = twitter.direct_messages.sent
    elif screen_name:
        fetch = twitter.statuses.user_timeline
    else:  # self
        fetch = twitter.statuses.home_timeline
    timeline = fetch(**request)

    # some tweets do not provide screen name but user id, resolve those;
    # this isn't a valid operation for DMs, so they are left untouched
    if not is_dm:
        timeline = statuses_resolve_uids(twitter, timeline)

    portion = {}
    for status in timeline:
        body = status['text']
        retweeted = status.get('retweeted_status')
        if retweeted:
            body = "RT @%s: %s" % (retweeted['user']['screen_name'],
                                   retweeted['text'])
        stamp = format_date(status['created_at'])
        if is_dm:
            # DMs don't include mentions by default, so in order to show
            # who the recipient was, a mention is synthesised
            entry = "%s <%s> @%s %s" % (stamp,
                                        status['sender_screen_name'],
                                        status['recipient']['screen_name'],
                                        format_text(body))
        else:
            entry = "%s <%s> %s" % (stamp,
                                    status['user']['screen_name'],
                                    format_text(body))
        portion[status['id']] = entry
    return portion
218 | ||
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None):
    """Get all the statuses for a screen name.

    Portions are fetched with statuses_portion(), walking backwards through
    ids, and merged into ``tweets`` (updated in place). Browsing stops once
    a portion adds fewer than 190 new entries, or on a 401 or 404 answer.
    Other failures are reported through err() and retried after a wait.
    """
    max_id = None
    fail = Fail()
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id,
                                       mentions, favorites, received_dms)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_at = rate['reset_time_in_seconds']
                reset = time.asctime(time.localtime(reset_at))
                delay = int(reset_at - time.time()) + 5  # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
            continue
        except (urllib2.URLError, httplib.error, KeyError) as e:
            if isinstance(e, urllib2.URLError):
                label = "urllib2.URLError"
            elif isinstance(e, httplib.error):
                label = "httplib.error"
            else:
                label = "KeyError"
            err("Fail: " + label + " %s - Retrying..." % str(e))
            fail.wait(3)
            continue

        # the portion was fetched: merge it and count what is really new
        known = len(tweets)
        tweets.update(portion)
        new = len(tweets) - known
        err("Browsing %s statuses, new tweets: %i"
            % (screen_name if screen_name else "home", new))
        if new < 190:
            break
        max_id = min(portion) - 1  # browse backwards
        fail = Fail()
272 | ||
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Queries account.rate_limit_status() once, then prints the remaining and
    hourly request counts, followed by the delay until the next reset and
    the local time of that reset.
    """
    status = twitter.account.rate_limit_status()
    print("Remaining API requests: %i/%i (hourly limit)"
          % (status['remaining_hits'], status['hourly_limit']))

    reset_at = status['reset_time_in_seconds']
    seconds_left = int(reset_at - time.time())
    reset_when = time.asctime(time.localtime(reset_at))
    print("Next reset in %is (%s)" % (seconds_left, reset_when))
281 | ||
def main(args=sys.argv[1:]):
    """Command-line entry point of the archiver.

    Parses ``args``, optionally authenticates with OAuth, then, depending on
    the options: prints the API rate limit status and returns; archives the
    authenticated user's own timeline or mentions; archives the
    authenticated user's direct messages; and finally archives the tweets
    (or favorites) of every user given on the command line, or read from
    standard input when the only argument is "-".

    Args:
        args: list of command-line argument strings.

    Raises:
        SystemExit: with status 1 on invalid options, when an option that
            needs authentication is used without OAuth, or when archiving
            is interrupted from the keyboard.
    """
    # statuses_portion() reads this module-level formatter
    global format_text

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # choose how tweet text is formatted: with or without following links
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # save own direct messages, received then sent, into a single archive
    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = {}
        try:
            dms = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved DMs: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", dms, received_dms=True)
            statuses(twitter, "", dms, received_dms=False)
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        users = [line.strip() for line in sys.stdin.readlines()]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        before = len(tweets)
        try:
            statuses(twitter, user, tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))