]> jfr.im git - z_archive/twitter.git/blame - twitter/archiver.py
Version 1.17.0
[z_archive/twitter.git] / twitter / archiver.py
CommitLineData
a7282452
S
1"""USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11OPTIONS
4f0b5ca6 12 -o --oauth authenticate to Twitter using OAuth (default: no)
a7282452
S
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
4f0b5ca6
H
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
907402f6 20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
01618308
MC
22 -d --dms <file> archive own direct messages (both received and
23 sent) into given file name.
694aaadf 24 -i --isoformat store dates in ISO format (specifically RFC 3339)
a7282452
S
25
26AUTHENTICATION
27 Authenticate to Twitter using OAuth to archive tweets of private profiles
28 and have higher API rate limits. OAuth authentication tokens are stored
29 in ~/.twitter-archiver_oauth.
30"""
31
32from __future__ import print_function
33
694aaadf
MC
34import os, sys, time as _time, calendar, functools
35from datetime import time, date, datetime
a7282452
S
36from getopt import gnu_getopt as getopt, GetoptError
37
62ec1b07 38try:
39 import urllib.request as urllib2
40 import http.client as httplib
41except ImportError:
42 import urllib2
43 import httplib
44
45
a7282452
S
46# T-Archiver (Twitter-Archiver) application registered by @stalkr_
47CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
48CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
49
50from .api import Twitter, TwitterError
51from .oauth import OAuth, read_token_file
52from .oauth_dance import oauth_dance
53from .auth import NoAuth
907402f6 54from .util import Fail, err, expand_line, parse_host_list
a7282452 55from .follow import lookup
694aaadf 56from .timezones import utc as UTC, Local
a7282452
S
57
def parse_args(args, options):
    """Fill *options* from the command-line arguments in *args*.

    Switches store True under their key, options that take an argument
    store that argument, and whatever is left over after option parsing
    is stored under 'extra_args'.  -h/--help prints the module docstring
    and exits with status 0.  Unknown options make getopt raise
    GetoptError, which is left to the caller.
    """
    short_spec = "hos:at:m:vfr:d:i"
    long_spec = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 "redirect-sites=", 'dms=', 'isoformat']

    # (short flag, long flag, key in options, takes an argument?)
    table = (
        ('-o', '--oauth', 'oauth', False),
        ('-s', '--save-dir', 'save-dir', True),
        ('-a', '--api-rate', 'api-rate', False),
        ('-t', '--timeline', 'timeline', True),
        ('-m', '--mentions', 'mentions', True),
        ('-v', '--favorites', 'favorites', False),
        ('-f', '--follow-redirects', 'follow-redirects', False),
        ('-r', '--redirect-sites', 'redirect-sites', True),
        ('-d', '--dms', 'dms', True),
        ('-i', '--isoformat', 'isoformat', False),
    )
    targets = {}
    for short_flag, long_flag, key, takes_value in table:
        targets[short_flag] = targets[long_flag] = (key, takes_value)

    parsed, positional = getopt(args, short_spec, long_spec)

    for flag, value in parsed:
        if flag in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if flag in targets:
            key, takes_value = targets[flag]
            options[key] = value if takes_value else True

    options['extra_args'] = positional
90
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Each line is expected to be "<tweet id> <tweet text>", UTF-8 encoded.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text).
        An empty dict is returned when the file cannot be opened, which
        is the normal case on the first run.  Lines that cannot be
        parsed are reported through err() and skipped.
    """
    try:
        # Read bytes and decode per line: this behaves the same on
        # Python 2 and 3 and keeps one undecodable line from aborting
        # the whole load.
        archive = open(filename, "rb")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:
        for raw in archive:
            try:
                tid, text = raw.strip().split(b" ", 1)
                tweets[int(tid)] = text.decode("utf-8")
            except ValueError as e:
                # Covers a missing separator, a non-numeric id and
                # invalid UTF-8 (UnicodeDecodeError is a ValueError).
                err("loading tweet %s failed due to %s"
                    % (raw.decode("utf-8", "replace"), e))
    return tweets
108
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when *tweets* is empty.
    Failure to open the file is reported through err() and the function
    returns; a failure on a single tweet is reported and that tweet is
    skipped.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import so this block does not depend on the file's import list.
    import io

    if len(tweets) == 0:
        return

    try:
        # io.open encodes on write on both Python 2 and 3; encoding by
        # hand and formatting with %s would write the b'...' repr on
        # Python 3.
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:
        for k in sorted(tweets):
            try:
                archive.write(u"%i %s\n" % (k, tweets[k]))
            except Exception as ex:
                # Deliberately best-effort: one bad entry must not lose
                # the rest of the archive.
                err("archiving tweet %s failed due to %s" % (k, ex))
137
def format_date(utc, isoformat=False):
    """Convert Twitter's UTC date string to a local-time string.

    *utc* is parsed with the pattern '%a %b %d %H:%M:%S %Z %Y' after
    '+0000' has been replaced by 'UTC'.  The result is always converted
    to the Local timezone, then rendered with datetime.isoformat() when
    *isoformat* is true, or as 'YYYY-MM-DD HH:MM:SS TZ' otherwise.
    """
    naive = datetime.strptime(utc.replace('+0000', 'UTC'),
                              '%a %b %d %H:%M:%S %Z %Y')
    # The pattern has no sub-second field, so attaching the UTC tzinfo
    # to the parsed value gives the same aware datetime as rebuilding it
    # from its date and hour/minute/second.
    localized = naive.replace(tzinfo=UTC).astimezone(Local)

    if isoformat:
        return localized.isoformat()
    return localized.strftime('%Y-%m-%d %H:%M:%S %Z')
a7282452 154
def expand_format_text(hosts, text):
    """Run *text* through expand_line() for *hosts*, then make it one line.

    Used as the text formatter when redirect following is enabled.
    """
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
158
def direct_format_text(text):
    """Escape line breaks so that *text* fits on a single line.

    Newlines become the two characters backslash-n and carriage returns
    become backslash-r; everything else is left untouched.
    """
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text
162
4f0b5ca6
H
def statuses_resolve_uids(twitter, tl):
    """Resolve user ids to screen names from statuses.

    Looks at the 'user' of every status in *tl* and of its
    'retweeted_status' when present.  Users without a 'screen_name' are
    resolved with a single lookup() call and the name is written back
    into the status dicts in place.  Returns a new list holding the same
    status objects in the same order.
    """
    def unnamed_users(status):
        """Yield the user dicts of *status* that lack a screen name."""
        retweet = status.get('retweeted_status')
        if retweet and not retweet['user'].get('screen_name'):
            yield retweet['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # first pass: collect every id that needs a lookup
    pending_ids = [user['id']
                   for status in tl
                   for user in unnamed_users(status)]

    # one lookup for all of them, duplicates removed
    names = lookup(twitter, list(set(pending_ids)))

    # second pass: write the resolved names back
    resolved = []
    for status in tl:
        for user in unnamed_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
190
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API object the timeline endpoints are called on.
        screen_name: user to fetch; a false value selects the home
            timeline when no other mode is requested.
        max_id: when true, passed to the API as 'max_id'.
        mentions: fetch the mentions timeline instead.
        favorites: fetch the favorites list instead (checked after
            *mentions*).
        received_dms: None for regular statuses; True fetches received
            direct messages, False fetches sent ones.
        isoformat: forwarded to format_date().

    Returns:
        A dict mapping each status id to its formatted one-line text.

    NOTE(review): relies on the module-level name format_text, which is
    only assigned inside main() - confirm before calling this from
    anywhere else.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    # Identity test rather than "!= None": None is a singleton and this
    # must not depend on how the argument implements equality.
    dm_mode = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif dm_mode:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name:
        tl = twitter.statuses.user_timeline(**kwargs)
    else:  # self
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if dm_mode:
        newtl = tl
    else:
        newtl = statuses_resolve_uids(twitter, tl)

    tweets = {}
    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        when = format_date(t['created_at'], isoformat=isoformat)
        if dm_mode:
            # DMs don't include mentions by default, so in order to show
            # who the recipient was, we synthesise a mention.
            tweets[t['id']] = "%s <%s> @%s %s" % (
                when,
                t['sender_screen_name'],
                t['recipient']['screen_name'],
                format_text(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (
                when,
                t['user']['screen_name'],
                format_text(text))
    return tweets
237
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get all the statuses for a screen name.

    Fetches portions with statuses_portion() and merges each one into
    *tweets* (updated in place).  After every successful portion max_id
    is moved just below the smallest id fetched, so browsing goes
    backwards.  The loop ends when a portion adds fewer than 190 new
    entries, or on a 401 or 404 API error.  A 429 error sleeps until the
    reported rate-limit reset; other API errors, URL/HTTP errors and
    KeyError are retried after fail.wait(3).
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, incrementing max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id,
                                       mentions, favorites, received_dms,
                                       isoformat)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 429:
                err("Fail: %i API rate limit exceeded" % code)
                rls = twitter.application.rate_limit_status()
                reset_epoch = rls.rate_limit_reset
                reset_text = _time.asctime(_time.localtime(reset_epoch))
                delay = int(reset_epoch - _time.time()) + 5  # avoid race
                err("Interval limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rls.rate_limit_limit,
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known_before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", new))
            if new < 190:
                break
            max_id = min(portion) - 1  # browse backwards
            fail = Fail()  # fresh failure tracker after a success
291
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Queries twitter.application.rate_limit_status() once and prints the
    remaining/total request counts, followed by the number of seconds
    until the reset and the reset moment in local time.
    """
    rls = twitter.application.rate_limit_status()

    quota = (rls.rate_limit_remaining, rls.rate_limit_limit)
    print("Remaining API requests: %i/%i (interval limit)" % quota)

    seconds_left = int(rls.rate_limit_reset - _time.time())
    reset_text = _time.asctime(_time.localtime(rls.rate_limit_reset))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
a7282452
S
300
def _load_archive(filename, what):
    """Return the archive saved in *filename*, or {} if loading fails.

    *what* names the content ("tweets" or "DMs") in the error message.
    """
    try:
        return load_tweets(filename)
    except Exception as e:
        err("Error when loading saved %s: %s - continuing without"
            % (what, str(e)))
        return {}


def _fetch_statuses(*args, **kwargs):
    """Call statuses(); on Ctrl-C report the interruption and exit 1."""
    try:
        statuses(*args, **kwargs)
    except KeyboardInterrupt:
        err()
        err("Interrupted")
        raise SystemExit(1)


def main(args=None):
    """Command-line entry point of the archiver.

    Args:
        args: list of command-line arguments without the program name;
            defaults to sys.argv[1:] read at call time.

    Parses the options, builds the Twitter client (OAuth or anonymous),
    then, depending on the options: prints the API rate limit and
    returns; archives the authenticated user's timeline or mentions;
    archives their direct messages; and finally archives every user
    given on the command line (or on standard input when the only
    argument is "-").  Exits with status 1 on bad options, on a missing
    authentication for an own-account archive, or on Ctrl-C.
    """
    # Resolve the default here: a default of sys.argv[1:] in the
    # signature would be frozen when the module is imported.
    if args is None:
        args = sys.argv[1:]

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.environ.get('HOME',
                          os.environ.get('USERPROFILE', ''))
                          + os.sep
                          + '.twitter-archiver_oauth')

        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # statuses_portion() reads this module-level formatter
    global format_text
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    isoformat = options['isoformat']

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = _load_archive(filename, "tweets")
        _fetch_statuses(twitter, "", tweets, options['mentions'],
                        options['favorites'], isoformat=isoformat)
        save_tweets(filename, tweets)

        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # save own direct messages, received then sent, into one archive
    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = _load_archive(filename, "DMs")
        _fetch_statuses(twitter, "", dms, received_dms=True,
                        isoformat=isoformat)
        _fetch_statuses(twitter, "", dms, received_dms=False,
                        isoformat=isoformat)
        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: an empty name would select the home timeline
        # and use the save directory itself as the archive path.
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_archive(filename, "tweets")
        before = len(tweets)
        _fetch_statuses(twitter, user, tweets, options['mentions'],
                        options['favorites'], isoformat=isoformat)
        save_tweets(filename, tweets)

        new = len(tweets) - before
        total += len(tweets)
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))