]>
Commit | Line | Data |
---|---|---|
a7282452 S |
1 | """USAGE |
2 | twitter-archiver [options] <-|user> [<user> ...] | |
3 | ||
4 | DESCRIPTION | |
5 | Archive tweets of users, sorted by date from oldest to newest, in | |
6 | the following format: <id> <date> <<screen_name>> <tweet_text> | |
7 | Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to | |
8 | resume archiving on next run. Archive file name is the user name. | |
9 | Provide "-" instead of users to read users from standard input. | |
10 | ||
11 | OPTIONS | |
4f0b5ca6 | 12 | -o --oauth authenticate to Twitter using OAuth (default: no) |
a7282452 S |
13 | -s --save-dir <path> directory to save archives (default: current dir) |
14 | -a --api-rate see current API rate limit status | |
15 | -t --timeline <file> archive own timeline into given file name (requires | |
4f0b5ca6 H |
16 | OAuth, max 800 statuses) |
17 | -m --mentions <file> archive own mentions instead of timeline into | |
18 | given file name (requires OAuth, max 800 statuses) | |
19 | -v --favorites archive user's favorites instead of timeline | |
907402f6 | 20 | -f --follow-redirects follow redirects of urls |
21 | -r --redirect-sites follow redirects for this comma separated list of hosts | |
01618308 MC |
22 | -d --dms <file> archive own direct messages (both received and |
23 | sent) into given file name. | |
694aaadf | 24 | -i --isoformat store dates in ISO format (specifically RFC 3339) |
a7282452 S |
25 | |
26 | AUTHENTICATION | |
27 | Authenticate to Twitter using OAuth to archive tweets of private profiles | |
28 | and have higher API rate limits. OAuth authentication tokens are stored | |
29 | in ~/.twitter-archiver_oauth. | |
30 | """ | |
31 | ||
32 | from __future__ import print_function | |
33 | ||
694aaadf MC |
34 | import os, sys, time as _time, calendar, functools |
35 | from datetime import time, date, datetime | |
a7282452 S |
36 | from getopt import gnu_getopt as getopt, GetoptError |
37 | ||
62ec1b07 | 38 | try: |
39 | import urllib.request as urllib2 | |
40 | import http.client as httplib | |
41 | except ImportError: | |
42 | import urllib2 | |
43 | import httplib | |
44 | ||
45 | ||
a7282452 S |
46 | # T-Archiver (Twitter-Archiver) application registered by @stalkr_ |
47 | CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ' | |
48 | CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8' | |
49 | ||
50 | from .api import Twitter, TwitterError | |
51 | from .oauth import OAuth, read_token_file | |
52 | from .oauth_dance import oauth_dance | |
53 | from .auth import NoAuth | |
907402f6 | 54 | from .util import Fail, err, expand_line, parse_host_list |
a7282452 | 55 | from .follow import lookup |
694aaadf | 56 | from .timezones import utc as UTC, Local |
a7282452 S |
57 | |
def parse_args(args, options):
    """Fill ``options`` in place from the command-line arguments ``args``.

    Switches store True under their option key, options that take a value
    store that value, and every remaining (non-option) argument is stored
    as a list under options['extra_args'].

    -h/--help prints the module documentation and exits with status 0.
    A GetoptError raised by getopt is not caught here.
    """
    short_opts = "hos:at:m:vfr:d:i"
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 "redirect-sites=", 'dms=', 'isoformat']

    # (short letter, long name, takes a value); the long name doubles as
    # the key under which the result is stored in ``options``
    known = (
        ('o', 'oauth', False),
        ('s', 'save-dir', True),
        ('a', 'api-rate', False),
        ('t', 'timeline', True),
        ('m', 'mentions', True),
        ('v', 'favorites', False),
        ('f', 'follow-redirects', False),
        ('r', 'redirect-sites', True),
        ('d', 'dms', True),
        ('i', 'isoformat', False),
    )
    targets = {}
    for letter, long_name, takes_value in known:
        targets['-' + letter] = (long_name, takes_value)
        targets['--' + long_name] = (long_name, takes_value)

    parsed, extra_args = getopt(args, short_opts, long_opts)

    for flag, value in parsed:
        if flag in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if flag in targets:
            key, takes_value = targets[flag]
            options[key] = value if takes_value else True

    options['extra_args'] = extra_args
90 | ||
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    The archive is read as bytes and every line is decoded from UTF-8 on
    its own, so the function behaves the same on Python 2 and Python 3 and
    one undecodable or malformed line does not discard the whole archive.

    Args:
        filename: A string representing the file name to load tweets from.

    Returns:
        A dict mapping tweet-ids (int) to tweet text (unicode text). Empty
        when the file cannot be opened (no archive yet).
    """
    try:
        archive = open(filename, "rb")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:
        for line in archive:
            try:
                tid, text = line.strip().split(b" ", 1)
                tweets[int(tid)] = text.decode("utf-8")
            except ValueError as e:
                # Covers a missing separator, a non-numeric id and invalid
                # UTF-8 (UnicodeDecodeError is a ValueError). %r is used
                # because the raw line may not be printable text.
                err("loading tweet %r failed due to %s" % (line, e))

    return tweets
108 | ||
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Each line is formatted as text first and encoded to UTF-8 afterwards,
    then written in binary mode. Formatting already-encoded bytes with %s
    would write their ``b'...'`` repr into the archive on Python 3.

    Nothing is written (and no file is created) when ``tweets`` is empty.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    if not tweets:
        return

    try:
        archive = open(filename, "wb")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:
        for k in sorted(tweets):
            try:
                archive.write(("%i %s\n" % (k, tweets[k])).encode("utf-8"))
            except Exception as ex:
                # deliberate best effort: one tweet that cannot be written
                # must not prevent the remaining ones from being saved
                err("archiving tweet %s failed due to %s" % (k, ex))
137 | ||
def format_date(utc, isoformat=False):
    """Turn one of Twitter's UTC timestamps into a local-time date string.

    ``utc`` is expected to match '%a %b %d %H:%M:%S %Z %Y' once '+0000'
    has been replaced by 'UTC'. The value is converted to the ``Local``
    timezone and returned in ISO format when ``isoformat`` is true, or as
    'YYYY-MM-DD HH:MM:SS TZ' otherwise.
    """
    naive = datetime.strptime(utc.replace('+0000', 'UTC'),
                              '%a %b %d %H:%M:%S %Z %Y')

    # strptime returns a naive datetime here: tag it as UTC first, then
    # move it to local time
    local = naive.replace(tzinfo=UTC).astimezone(Local)

    if isoformat:
        return local.isoformat()
    return local.strftime('%Y-%m-%d %H:%M:%S %Z')
a7282452 | 154 | |
def expand_format_text(hosts, text):
    """Run text through expand_line() with hosts, then make it one line.

    Per the original docstring, expand_line() is what follows the
    redirects of the links found in the text.
    """
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
158 | ||
def direct_format_text(text):
    """Return text on a single line.

    Line feeds and carriage returns are replaced by their two-character
    escapes (a backslash followed by 'n' or 'r').
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
162 | ||
4f0b5ca6 H |
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names of the statuses in ``tl``.

    Collects the id of every user (of a status or of the status it
    retweets) that has no 'screen_name', resolves all of them with one
    lookup() call and writes the names back into the status dicts.

    Returns a new list holding the same (updated in place) statuses.
    """
    def nameless_users(status):
        # user dicts lacking a screen name: retweeted status first, then
        # the status itself
        retweeted = status.get('retweeted_status')
        if retweeted and not retweeted['user'].get('screen_name'):
            yield retweeted['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # first pass: gather every user id that needs a lookup
    pending_ids = []
    for status in tl:
        for user in nameless_users(status):
            pending_ids.append(user['id'])

    # a single lookup for all of them, without duplicates
    names = lookup(twitter, list(set(pending_ids)))

    # second pass: write the resolved names back
    resolved = []
    for status in tl:
        for user in nameless_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
190 | ||
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    Requests up to 200 entries from exactly one source, chosen in this
    order: mentions, favorites, direct messages, the timeline of
    ``screen_name``, or the home timeline when ``screen_name`` is empty.

    Args:
        twitter: API object the requests are made through.
        screen_name: user to query; empty for the authenticated user.
        max_id: when set, passed to the API as 'max_id'.
        mentions: fetch the mentions timeline.
        favorites: fetch favorites.
        received_dms: tri-state; None means "not direct messages", True
            selects received DMs and False selects sent DMs.
        isoformat: passed on to format_date().

    Returns:
        A dict mapping each id to its archive text:
        '<date> <<sender>> <text>' for statuses and
        '<date> <<sender>> @<recipient> <text>' for direct messages.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    # None is a meaningful third state here, so test identity rather than
    # equality or truthiness
    is_dm = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name:
        tl = twitter.statuses.user_timeline(**kwargs)
    else:  # self
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those;
    # this isn't a valid operation for DMs, so special-case them
    if not is_dm:
        tl = statuses_resolve_uids(twitter, tl)

    tweets = {}
    for t in tl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        when = format_date(t['created_at'], isoformat=isoformat)
        # format_text is the module-level global that main() binds before
        # any status is fetched
        if is_dm:
            # DMs don't include mentions by default, so in order to show
            # who the recipient was, we synthesise a mention
            tweets[t['id']] = "%s <%s> @%s %s" % (when,
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  format_text(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (when,
                                              t['user']['screen_name'],
                                              format_text(text))
    return tweets
237 | ||
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get all the statuses for a screen name.

    Fetches portion after portion with statuses_portion(), walking
    backwards through ids, and merges every portion into ``tweets`` (which
    is updated in place). Stops once a portion adds fewer than 190 new
    entries, or on a 401 or 404 answer. A 429 answer sleeps until the
    rate limit resets; other errors wait 3 seconds before the same
    request is tried again.
    """
    oldest = None
    fail = Fail()
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, oldest, mentions,
                                       favorites, received_dms, isoformat)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 429:
                err("Fail: %i API rate limit exceeded" % code)
                rls = twitter.application.rate_limit_status()
                reset_text = _time.asctime(
                    _time.localtime(rls.rate_limit_reset))
                # a few extra seconds so we do not wake up just too early
                delay = int(rls.rate_limit_reset - _time.time()) + 5
                err("Interval limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rls.rate_limit_limit,
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known_before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", new))
            if new < 190:
                break
            # next request: everything strictly older than this portion
            oldest = min(portion) - 1
            # a successful request starts over with a fresh Fail object
            fail = Fail()
291 | ||
def rate_limit_status(twitter):
    """Print the remaining API request quota and when it resets.

    Queries twitter.application.rate_limit_status() once and prints two
    lines: remaining/limit, then the seconds left until the reset along
    with the reset moment in local time.
    """
    status = twitter.application.rate_limit_status()

    quota = (status.rate_limit_remaining, status.rate_limit_limit)
    print("Remaining API requests: %i/%i (interval limit)" % quota)

    reset_at = status.rate_limit_reset
    seconds_left = int(reset_at - _time.time())
    reset_text = _time.asctime(_time.localtime(reset_at))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
a7282452 S |
300 | |
def _load_archive(filename, what):
    """Load a saved archive; fall back to an empty one if loading fails."""
    try:
        return load_tweets(filename)
    except Exception as e:
        # deliberate best effort: an unreadable archive must not prevent
        # a new archiving run
        err("Error when loading saved %s: %s - continuing without"
            % (what, str(e)))
        return {}


def _fetch_statuses(*args, **kwargs):
    """Call statuses(); an interruption by the user exits with status 1."""
    try:
        statuses(*args, **kwargs)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)


def main(args=sys.argv[1:]):
    """Command-line entry point of twitter-archiver.

    Parses ``args``, authenticates (OAuth when requested), then, depending
    on the options: prints the API rate limit status, archives the
    authenticated user's timeline or mentions, archives their direct
    messages, and finally archives the tweets (or favorites) of every
    user named on the command line or read from standard input.

    Exits with status 1 on invalid options, when an option that requires
    OAuth is used without it, or when interrupted by the user.
    """
    # statuses_portion() formats text through this module-level name
    global format_text

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.environ.get('HOME',
                          os.environ.get('USERPROFILE', ''))
                          + os.sep
                          + '.twitter-archiver_oauth')

        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        # When both options are given the timeline wins; what is fetched
        # must match the file name and the messages below.
        own_mentions = not options['timeline']
        if own_mentions:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)
        else:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)

        tweets = _load_archive(filename, "tweets")
        _fetch_statuses(twitter, "", tweets, own_mentions,
                        options['favorites'], isoformat=options['isoformat'])
        save_tweets(filename, tweets)

        if own_mentions:
            print("Total mentions: %i" % len(tweets))
        else:
            print("Total tweets in own timeline: %i" % len(tweets))

    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = _load_archive(filename, "DMs")
        # received and sent messages end up in the same archive
        for received in (True, False):
            _fetch_statuses(twitter, "", dms, received_dms=received,
                            isoformat=options['isoformat'])
        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # A blank line would become an empty user name, which
        # statuses_portion() treats as "home timeline"; skip those.
        users = [name for name in (line.strip() for line in sys.stdin)
                 if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_archive(filename, "tweets")
        before = len(tweets)
        # --mentions only concerns the authenticated user's own archive
        # (handled above); passing it here would fill every user archive
        # with those mentions.
        _fetch_statuses(twitter, user, tweets, False, options['favorites'],
                        isoformat=options['isoformat'])
        save_tweets(filename, tweets)

        new = len(tweets) - before
        total += len(tweets)
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))