]>
Commit | Line | Data |
---|---|---|
a7282452 S |
1 | """USAGE |
2 | twitter-archiver [options] <-|user> [<user> ...] | |
3 | ||
4 | DESCRIPTION | |
5 | Archive tweets of users, sorted by date from oldest to newest, in | |
6 | the following format: <id> <date> <<screen_name>> <tweet_text> | |
7 | Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to | |
8 | resume archiving on next run. Archive file name is the user name. | |
9 | Provide "-" instead of users to read users from standard input. | |
10 | ||
11 | OPTIONS | |
4f0b5ca6 | 12 | -o --oauth authenticate to Twitter using OAuth (default: no) |
a7282452 S |
13 | -s --save-dir <path> directory to save archives (default: current dir) |
14 | -a --api-rate see current API rate limit status | |
15 | -t --timeline <file> archive own timeline into given file name (requires | |
4f0b5ca6 H |
16 | OAuth, max 800 statuses) |
17 | -m --mentions <file> archive own mentions instead of timeline into | |
18 | given file name (requires OAuth, max 800 statuses) | |
19 | -v --favorites archive user's favorites instead of timeline | |
907402f6 | 20 | -f --follow-redirects follow redirects of urls |
21 | -r --redirect-sites follow redirects for this comma separated list of hosts | |
a7282452 S |
22 | |
23 | AUTHENTICATION | |
24 | Authenticate to Twitter using OAuth to archive tweets of private profiles | |
25 | and have higher API rate limits. OAuth authentication tokens are stored | |
26 | in ~/.twitter-archiver_oauth. | |
27 | """ | |
28 | ||
29 | from __future__ import print_function | |
30 | ||
62ec1b07 | 31 | import os, sys, time, calendar, functools |
a7282452 S |
32 | from getopt import gnu_getopt as getopt, GetoptError |
33 | ||
62ec1b07 | 34 | try: |
35 | import urllib.request as urllib2 | |
36 | import http.client as httplib | |
37 | except ImportError: | |
38 | import urllib2 | |
39 | import httplib | |
40 | ||
41 | ||
a7282452 S |
42 | # T-Archiver (Twitter-Archiver) application registered by @stalkr_ |
43 | CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ' | |
44 | CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8' | |
45 | ||
46 | from .api import Twitter, TwitterError | |
47 | from .oauth import OAuth, read_token_file | |
48 | from .oauth_dance import oauth_dance | |
49 | from .auth import NoAuth | |
907402f6 | 50 | from .util import Fail, err, expand_line, parse_host_list |
a7282452 S |
51 | from .follow import lookup |
52 | ||
def parse_args(args, options):
    """Fill ``options`` in place from the command-line arguments.

    Args:
        args: list of command-line arguments (without the program name).
        options: dict updated in place; switches store True, options that
            take a value store that value, and the remaining positional
            arguments are stored under the 'extra_args' key.

    Raises:
        GetoptError: on an unknown option or a missing option argument.
        SystemExit: with status 0 after printing the usage for -h/--help.
    """
    short_opts = "hos:at:m:vfr:"
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 "redirect-sites="]

    # switches just turn an option on
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
    }
    # valued options keep the argument that follows them
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
    }

    parsed, positional = getopt(args, short_opts, long_opts)
    for name, value in parsed:
        if name in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if name in switches:
            options[switches[name]] = True
        elif name in valued:
            options[valued[name]] = value

    options['extra_args'] = positional
81 | ||
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Args:
        filename: path of a UTF-8 encoded archive holding one
            "<tweet id> <tweet text>" entry per line.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). The
        dict is empty when the archive cannot be opened, e.g. because it
        does not exist yet.

    Raises:
        ValueError: if a non-blank line is not "<integer id> <text>" or the
            file is not valid UTF-8.
    """
    # Local import: io.open decodes UTF-8 the same way on Python 2 and 3,
    # so no str.decode() call (which does not exist on Python 3) is needed.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # closed even when a line fails to parse
        for line in archive:
            line = line.strip()
            if not line:  # tolerate blank lines instead of failing the load
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
96 | ||
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when the dict is empty.
    A file that cannot be opened is reported through err() and skipped.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import: io.open encodes to UTF-8 itself on Python 2 and 3, which
    # avoids writing the repr of a bytes object ("b'...'") on Python 3.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # closed even if a write fails
        for tid in sorted(tweets):  # ascending tweet id
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
122 | ||
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: date string such as "Wed Aug 27 13:08:45 +0000 2008".
        to_localtime: convert to the local time zone when True; the
            conversion is skipped when the local standard offset is zero.

    Returns:
        "YYYY-MM-DD HH:MM:SS TZ", where TZ is the local time zone
        abbreviation in effect at that date, or "UTC".

    Raises:
        ValueError: if ``utc`` does not match the expected format.
    """
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # tzname is (standard, DST): pick the one that applies to this date
        # instead of always using the DST abbreviation.
        tz = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + tz
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
131 | ||
def expand_format_text(hosts, text):
    """Expand links in text through expand_line(), then make it one line.

    ``hosts`` is handed to expand_line() unchanged; the expanded text is
    then escaped by direct_format_text().
    """
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
135 | ||
def direct_format_text(text):
    """Escape line breaks so that the text fits on a single line.

    Newlines become the two characters ``\\n`` and carriage returns the
    two characters ``\\r``.
    """
    escaped = text
    for raw, visible in (('\n', '\\n'), ('\r', '\\r')):
        escaped = escaped.replace(raw, visible)
    return escaped
139 | ||
4f0b5ca6 H |
def statuses_resolve_uids(twitter, tl):
    """Resolve user ids to screen names from statuses.

    Statuses (and their retweeted status, if any) whose user has no
    'screen_name' get it filled in place from a single lookup() call.

    Args:
        twitter: API object handed to lookup().
        tl: iterable of status dicts; it is traversed twice.

    Returns:
        A new list holding the same (updated) status objects, in order.
    """
    def users_without_name(status):
        """Yield the user dicts lacking a screen name, retweeted user first."""
        retweet = status.get('retweeted_status')
        if retweet and not retweet['user'].get('screen_name'):
            yield retweet['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # get all user ids that need a lookup (no screen_name key)
    pending_ids = [user['id']
                   for status in tl
                   for user in users_without_name(status)]

    # resolve all of them at once
    names = lookup(twitter, list(set(pending_ids)))

    # build new statuses with resolved uids
    resolved = []
    for status in tl:
        for user in users_without_name(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
167 | ||
4f0b5ca6 H |
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False):
    """Get a portion of the statuses of a screen name.

    Requests up to 200 statuses (retweets included) from one endpoint:
    mentions if ``mentions`` is set, else favorites if ``favorites`` is set,
    else the user timeline of ``screen_name``, or the home timeline when
    ``screen_name`` is empty.

    Args:
        twitter: API object used for the request.
        screen_name: user to query; empty for the authenticated user.
        max_id: when set, sent as the 'max_id' request parameter.
        mentions: fetch mentions instead of a timeline.
        favorites: fetch favorites instead of a timeline.

    Returns:
        A dict mapping tweet ids to "<date> <<screen_name>> <text>" lines,
        formatted with format_date() and the module-level format_text.
    """
    params = {'count': 200, 'include_rts': 1, 'screen_name': screen_name}
    if max_id:
        params['max_id'] = max_id

    # pick the endpoint first, then perform a single call
    if mentions:
        endpoint = twitter.statuses.mentions
    elif favorites:
        endpoint = twitter.favorites  # API v1, favorites.list() in v1.1
    elif screen_name:
        endpoint = twitter.statuses.user_timeline
    else:  # self
        endpoint = twitter.statuses.home_timeline
    timeline = endpoint(**params)

    portion = {}
    # some tweets do not provide screen name but user id, resolve those
    for status in statuses_resolve_uids(twitter, timeline):
        body = status['text']
        retweet = status.get('retweeted_status')
        if retweet:
            body = "RT @%s: %s" % (retweet['user']['screen_name'],
                                   retweet['text'])
        date = format_date(status['created_at'])
        author = status['user']['screen_name']
        portion[status['id']] = "%s <%s> %s" % (date, author,
                                                format_text(body))
    return portion
195 | ||
4f0b5ca6 H |
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False):
    """Get all the statuses for a screen name.

    Fetches portions with statuses_portion(), going backwards in time, and
    merges them into ``tweets`` in place until a portion adds fewer than
    190 new tweets.

    Args:
        twitter: API object used for the requests.
        screen_name: user to archive; empty means the home timeline.
        tweets: dict of already known tweets, updated in place.
        mentions: passed on to statuses_portion().
        favorites: passed on to statuses_portion().

    Error handling: 401 and 404 responses stop the archiving; a 400
    response sleeps until the rate limit reset and retries; any other
    TwitterError, URLError, httplib error or KeyError is retried after
    fail.wait(3).
    """
    oldest_id = None
    fail = Fail()
    # get portions of statuses, lowering max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, oldest_id,
                                       mentions, favorites)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_at = rate['reset_time_in_seconds']
                reset_text = time.asctime(time.localtime(reset_at))
                delay = int(reset_at - time.time()) + 5  # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except (urllib2.URLError, httplib.error, KeyError) as e:
            # same retry policy for all three, only the label differs
            if isinstance(e, urllib2.URLError):
                label = "urllib2.URLError"
            elif isinstance(e, httplib.error):
                label = "httplib.error"
            else:
                label = "KeyError"
            err("Fail: %s %s - Retrying..." % (label, str(e)))
            fail.wait(3)
        else:
            known = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", new))
            if new < 190:
                break
            oldest_id = min(portion.keys()) - 1  # browse backwards
            fail = Fail()  # fresh failure tracker after a successful portion
249 | ||
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Prints the remaining and hourly request counts, then the time left
    until the next reset together with the local reset date.
    """
    status = twitter.account.rate_limit_status()
    print("Remaining API requests: %i/%i (hourly limit)"
          % (status['remaining_hits'], status['hourly_limit']))

    reset_at = status['reset_time_in_seconds']
    seconds_left = int(reset_at - time.time())
    reset_text = time.asctime(time.localtime(reset_at))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
258 | ||
def _update_archive(twitter, screen_name, filename, mentions, favorites):
    """Merge freshly fetched statuses into an archive file.

    Loads the existing archive (continuing with an empty one when loading
    fails), fetches statuses, and saves the result back to ``filename``.

    Returns:
        A (total, new) tuple: tweets now in the archive, and how many of
        them were added by this run.

    Raises:
        SystemExit: with status 1 when interrupted from the keyboard.
    """
    tweets = {}
    try:
        tweets = load_tweets(filename)
    except Exception as e:  # best effort: a broken archive must not abort
        err("Error when loading saved tweets: %s - continuing without"
            % str(e))

    before = len(tweets)
    try:
        statuses(twitter, screen_name, tweets, mentions, favorites)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)

    save_tweets(filename, tweets)
    return len(tweets), len(tweets) - before


def main(args=sys.argv[1:]):
    """Command-line entry point, see the module docstring for usage.

    Parses ``args``, authenticates if requested, then either prints the API
    rate limit status, or archives the own timeline/mentions and the tweets
    (or favorites) of every given user.

    Args:
        args: list of command-line arguments (without the program name).

    Raises:
        SystemExit: with status 1 on invalid options, on an unauthenticated
            timeline/mentions request, or when interrupted.
    """
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline or mentions
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        # $HOME as before; fall back to the platform home directory instead
        # of the filesystem root when it is unset.
        home = os.getenv("HOME") or os.path.expanduser("~")
        oauth_filename = home + os.sep + ".twitter-archiver_oauth"
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # statuses_portion() formats tweet text through this module-level name
    global format_text
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        # The timeline file name wins when both are given, so fetch the
        # timeline too: file name, messages and content then agree.
        own_mentions = not options['timeline']
        if own_mentions:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)
        else:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)

        own_total, _ = _update_archive(twitter, "", filename, own_mentions,
                                       options['favorites'])
        if own_mentions:
            print("Total mentions: %i" % own_total)
        else:
            print("Total tweets in own timeline: %i" % own_total)

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # skip blank lines: an empty name would select the home timeline
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        # mentions only concern the authenticated user, never a given user
        count, new = _update_archive(twitter, user, filename, False,
                                     options['favorites'])
        total += count
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, count, new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))