]>
Commit | Line | Data |
---|---|---|
a7282452 S |
1 | """USAGE |
2 | twitter-archiver [options] <-|user> [<user> ...] | |
3 | ||
4 | DESCRIPTION | |
5 | Archive tweets of users, sorted by date from oldest to newest, in | |
6 | the following format: <id> <date> <<screen_name>> <tweet_text> | |
7 | Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to | |
8 | resume archiving on next run. Archive file name is the user name. | |
9 | Provide "-" instead of users to read users from standard input. | |
10 | ||
11 | OPTIONS | |
4f0b5ca6 | 12 | -o --oauth authenticate to Twitter using OAuth (default: no) |
a7282452 S |
13 | -s --save-dir <path> directory to save archives (default: current dir) |
14 | -a --api-rate see current API rate limit status | |
15 | -t --timeline <file> archive own timeline into given file name (requires | |
4f0b5ca6 H |
16 | OAuth, max 800 statuses) |
17 | -m --mentions <file> archive own mentions instead of timeline into | |
18 | given file name (requires OAuth, max 800 statuses) | |
19 | -v --favorites archive user's favorites instead of timeline | |
907402f6 | 20 | -f --follow-redirects follow redirects of urls |
21 | -r --redirect-sites follow redirects for this comma separated list of hosts | |
01618308 MC |
22 | -d --dms <file> archive own direct messages (both received and |
23 | sent) into given file name. | |
694aaadf | 24 | -i --isoformat store dates in ISO format (specifically RFC 3339) |
a7282452 S |
25 | |
26 | AUTHENTICATION | |
27 | Authenticate to Twitter using OAuth to archive tweets of private profiles | |
28 | and have higher API rate limits. OAuth authentication tokens are stored | |
29 | in ~/.twitter-archiver_oauth. | |
30 | """ | |
31 | ||
32 | from __future__ import print_function | |
33 | ||
694aaadf MC |
34 | import os, sys, time as _time, calendar, functools |
35 | from datetime import time, date, datetime | |
a7282452 S |
36 | from getopt import gnu_getopt as getopt, GetoptError |
37 | ||
62ec1b07 | 38 | try: |
39 | import urllib.request as urllib2 | |
40 | import http.client as httplib | |
41 | except ImportError: | |
42 | import urllib2 | |
43 | import httplib | |
44 | ||
45 | ||
a7282452 S |
46 | # T-Archiver (Twitter-Archiver) application registered by @stalkr_ |
47 | CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ' | |
48 | CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8' | |
49 | ||
50 | from .api import Twitter, TwitterError | |
51 | from .oauth import OAuth, read_token_file | |
52 | from .oauth_dance import oauth_dance | |
53 | from .auth import NoAuth | |
907402f6 | 54 | from .util import Fail, err, expand_line, parse_host_list |
a7282452 | 55 | from .follow import lookup |
694aaadf | 56 | from .timezones import utc as UTC, Local |
a7282452 S |
57 | |
def parse_args(args, options):
    """Fill `options` in place from the command-line arguments.

    Switches set their option to True, options taking a value store that
    value, and the remaining positional arguments are stored under
    'extra_args'. Prints the module help and exits with status 0 on
    -h/--help. GetoptError from getopt propagates to the caller.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites=", 'dms=', 'isoformat']
    short_opts = "hos:at:m:vfr:d:i"
    parsed, positional = getopt(args, short_opts, long_opts)

    # option spelling -> key in `options`, for switches without a value
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
        '-i': 'isoformat', '--isoformat': 'isoformat',
    }
    # option spelling -> key in `options`, for options carrying a value
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
        '-d': 'dms', '--dms': 'dms',
    }

    for flag, value in parsed:
        if flag in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if flag in switches:
            options[switches[flag]] = True
        elif flag in valued:
            options[valued[flag]] = value

    options['extra_args'] = positional
90 | ||
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    The archive is read as UTF-8 text, one "<tweet id> <tweet text>" entry
    per line; blank lines are skipped.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet-ids (int) to tweet text (unicode string).
        An empty dict if the archive cannot be opened (no archive yet).

    Raises:
        ValueError: if a non-blank line is not "<int id> <text>".
    """
    # io.open decodes to text on both Python 2 and 3; calling .decode()
    # on each line only worked on Python 2.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # closed even when a malformed line raises
        for line in archive:
            line = line.strip()
            if not line:
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
105 | ||
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written when `tweets` is empty. If the file cannot be
    opened, the error is reported through err() and the function returns.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # io.open encodes on write on both Python 2 and 3; formatting an
    # already-encoded bytes value with %s writes "b'...'" on Python 3.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # closed even if a write fails
        for k in sorted(tweets):
            archive.write(u"%i %s\n" % (k, tweets[k]))
131 | ||
def format_date(utc, isoformat=False):
    """Convert Twitter's UTC date string into a local-time date string.

    The input is parsed with the pattern '%a %b %d %H:%M:%S %Z %Y' after
    '+0000' is rewritten to 'UTC'. The result is rendered with isoformat()
    when `isoformat` is true, otherwise as 'YYYY-MM-DD HH:MM:SS TZ'.
    """
    parsed = datetime.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y')

    # strptime returns a naive datetime; tag it as UTC before converting.
    aware = parsed.replace(tzinfo=UTC)
    local = aware.astimezone(Local)

    if not isoformat:
        return local.strftime('%Y-%m-%d %H:%M:%S %Z')
    return local.isoformat()
a7282452 | 148 | |
def expand_format_text(hosts, text):
    """Follow redirects in links, then flatten the text to a single line."""
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
152 | ||
def direct_format_text(text):
    """Escape line breaks so that the text fits on a single line."""
    escaped = text
    for raw, shown in (('\n', '\\n'), ('\r', '\\r')):
        escaped = escaped.replace(raw, shown)
    return escaped
156 | ||
4f0b5ca6 H |
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names in statuses from their user ids.

    Statuses (and their retweeted statuses) whose user has no screen_name
    are updated in place with the names returned by lookup(). Returns a
    new list holding the same status objects.
    """
    def _nameless(user):
        return not user.get('screen_name')

    # collect every user id that has no screen name attached
    pending = []
    for status in tl:
        retweet = status.get('retweeted_status')
        if retweet and _nameless(retweet['user']):
            pending.append(retweet['user']['id'])
        if _nameless(status['user']):
            pending.append(status['user']['id'])

    # one lookup call for all of them, duplicates removed
    names = lookup(twitter, list(set(pending)))

    def _fill(user):
        if _nameless(user):
            user['screen_name'] = names[user['id']]

    # patch the statuses and gather them into the result list
    resolved = []
    for status in tl:
        retweet = status.get('retweeted_status')
        if retweet:
            _fill(retweet['user'])
        _fill(status['user'])
        resolved.append(status)

    return resolved
184 | ||
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API object used to issue the request.
        screen_name: user to fetch; an empty value selects the home
            timeline when fetching a plain timeline.
        max_id: when set, passed to the API as max_id.
        mentions: fetch the mentions timeline instead.
        favorites: fetch favorites instead (checked after `mentions`).
        received_dms: None for regular statuses, True for received
            direct messages, False for sent direct messages.
        isoformat: passed to format_date() for the date rendering.

    Returns:
        A dict mapping status id to its formatted archive text.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    # received_dms is tri-state, so test identity against None rather
    # than truthiness or equality.
    is_dm = received_dms is not None

    tweets = {}
    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    else:  # timeline
        if screen_name:
            tl = twitter.statuses.user_timeline(**kwargs)
        else:  # self
            tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if is_dm:
        newtl = tl
    else:
        newtl = statuses_resolve_uids(twitter, tl)

    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        # DMs don't include mentions by default, so in order to show who
        # the recipient was, we synthesise a mention. If we're not
        # operating on DMs, behave as normal
        if is_dm:
            tweets[t['id']] = "%s <%s> @%s %s" % (format_date(t['created_at'], isoformat=isoformat),
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  format_text(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at'], isoformat=isoformat),
                                              t['user']['screen_name'],
                                              format_text(text))
    return tweets
231 | ||
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get all the statuses for a screen name.

    Fetches portions with statuses_portion() and merges them into the
    `tweets` dict in place, walking backwards through the ids until a
    portion adds fewer than 190 new entries. Gives up on HTTP 401 and
    404; sleeps until the reset time on 400 (rate limit); retries after
    fail.wait(3) on other errors.
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, lowering max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites, received_dms, isoformat)
        except TwitterError as e:
            if e.e.code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % e.e.code)
                break
            elif e.e.code == 400:
                err("Fail: %i API rate limit exceeded" % e.e.code)
                rate = twitter.account.rate_limit_status()
                reset = rate['reset_time_in_seconds']
                # `time` is datetime.time in this module; the time module
                # itself is imported as `_time`.
                reset = _time.asctime(_time.localtime(reset))
                delay = int(rate['reset_time_in_seconds']
                            - _time.time()) + 5  # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset, delay))
                fail.wait(delay)
                continue
            elif e.e.code == 404:
                err("Fail: %i This profile does not exist" % e.e.code)
                break
            elif e.e.code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % e.e.code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            new = -len(tweets)
            tweets.update(portion)
            new += len(tweets)
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name if screen_name else "home", new))
            if new < 190:
                break
            max_id = min(portion.keys()) - 1  # browse backwards
            fail = Fail()
285 | ||
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Reads 'remaining_hits', 'hourly_limit' and 'reset_time_in_seconds'
    from twitter.account.rate_limit_status() and prints the remaining
    quota and the time of the next reset.
    """
    r = twitter.account.rate_limit_status()
    reset = r['reset_time_in_seconds']
    print("Remaining API requests: %i/%i (hourly limit)"
          % (r['remaining_hits'], r['hourly_limit']))
    # `time` is datetime.time in this module; the time module itself is
    # imported as `_time`.
    print("Next reset in %is (%s)"
          % (int(reset - _time.time()),
             _time.asctime(_time.localtime(reset))))
294 | ||
def _load_archive(filename, what):
    """Load a saved archive, or return an empty dict if loading fails.

    `what` names the archived items ("tweets" or "DMs") in the warning.
    """
    try:
        return load_tweets(filename)
    except Exception as e:
        err("Error when loading saved %s: %s - continuing without"
            % (what, str(e)))
        return {}


def main(args=None):
    """Run the archiver with the given command-line arguments.

    Args:
        args: list of argument strings; defaults to sys.argv[1:], read
            when main() is called rather than when the module is imported.

    Raises:
        SystemExit: with status 1 on invalid options, on a missing
            authentication for timeline/mentions/DMs, or when interrupted.
    """
    if args is None:
        args = sys.argv[1:]

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # statuses_portion() reads the module-level format_text chosen here
    global format_text
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = _load_archive(filename, "tweets")

        try:
            statuses(twitter, "", tweets, options['mentions'], options['favorites'], isoformat=options['isoformat'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # save own direct messages, received and sent, into one archive
    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = _load_archive(filename, "DMs")

        try:
            statuses(twitter, "", dms, received_dms=True, isoformat=options['isoformat'])
            statuses(twitter, "", dms, received_dms=False, isoformat=options['isoformat'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # skip blank lines: an empty user name would select the home
        # timeline and an archive path equal to the save directory
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_archive(filename, "tweets")

        before = len(tweets)
        try:
            statuses(twitter, user, tweets, options['mentions'], options['favorites'], isoformat=options['isoformat'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))