]> jfr.im git - z_archive/twitter.git/blame - twitter/archiver.py
Adapt code to archive DMs
[z_archive/twitter.git] / twitter / archiver.py
CommitLineData
a7282452
S
1"""USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11OPTIONS
4f0b5ca6 12 -o --oauth authenticate to Twitter using OAuth (default: no)
a7282452
S
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
4f0b5ca6
H
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
907402f6 20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
01618308
MC
22 -d --dms <file> archive own direct messages (both received and
23 sent) into given file name.
a7282452
S
24
25AUTHENTICATION
26 Authenticate to Twitter using OAuth to archive tweets of private profiles
27 and have higher API rate limits. OAuth authentication tokens are stored
28 in ~/.twitter-archiver_oauth.
29"""
30
31from __future__ import print_function
32
62ec1b07 33import os, sys, time, calendar, functools
a7282452
S
34from getopt import gnu_getopt as getopt, GetoptError
35
62ec1b07 36try:
37 import urllib.request as urllib2
38 import http.client as httplib
39except ImportError:
40 import urllib2
41 import httplib
42
43
a7282452
S
44# T-Archiver (Twitter-Archiver) application registered by @stalkr_
45CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
46CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
47
48from .api import Twitter, TwitterError
49from .oauth import OAuth, read_token_file
50from .oauth_dance import oauth_dance
51from .auth import NoAuth
907402f6 52from .util import Fail, err, expand_line, parse_host_list
a7282452
S
53from .follow import lookup
54
def parse_args(args, options):
    """Fill ``options`` in place from the command-line arguments ``args``.

    Switches without a value store True, switches with a value store that
    value, and the remaining positional arguments are stored under the
    'extra_args' key.  Asking for help prints the module usage text and
    exits with status 0.  Unknown switches raise GetoptError.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 "redirect-sites=", 'dms=']
    short_opts = "hos:at:m:vfr:d:"
    opts, extra_args = getopt(args, short_opts, long_opts)

    # switch spelling -> options key, for switches that take no value
    flags = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
    }
    # switch spelling -> options key, for switches that carry a value
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
        '-d': 'dms', '--dms': 'dms',
    }

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if opt in flags:
            options[flags[opt]] = True
        elif opt in valued:
            options[valued[opt]] = arg

    options['extra_args'] = extra_args
85
def load_tweets(filename):
    """Load tweets from a UTF-8 archive file into a dict, see save_tweets().

    Each non-blank line is "<tweet id> <tweet text>"; blank lines are
    skipped so a stray empty line does not invalidate the whole archive.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text).
        Empty when the file cannot be opened (no archive yet).

    Raises:
        ValueError: a non-blank line has no "<id> <text>" shape or its id
            is not an integer.
    """
    # local import: io.open decodes UTF-8 identically on Python 2 and 3,
    # whereas str.decode() does not exist on Python 3
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError: # no archive (yet)
        return {}

    tweets = {}
    # the with-block closes the file even when a line fails to parse
    with archive:
        for line in archive:
            line = line.strip()
            if not line:
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
100
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when ``tweets`` is empty.
    A failure to open the file is reported through err() and otherwise
    ignored.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (unicode text).
    """
    # local import: io.open encodes to UTF-8 on write on Python 2 and 3;
    # encoding by hand and formatting with %s writes "b'...'" on Python 3
    import io

    if len(tweets) == 0:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    # the with-block closes (and flushes) the file even if a write fails
    with archive:
        for k in sorted(tweets.keys()):
            archive.write(u"%i %s\n" % (k, tweets[k]))
126
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: A date string such as "Mon Jan 02 15:04:05 +0000 2012".
        to_localtime: When true and the local zone is not at offset 0,
            convert to local time; otherwise keep UTC.

    Returns:
        A string "YYYY-MM-DD HH:MM:SS TZ", where TZ is "UTC" or the local
        zone name that applies to that date.
    """
    u = time.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, daylight name): pick the one in
        # effect for this date instead of always using the daylight name
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    else:
        return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
135
def expand_format_text(hosts, text):
    """Expand the text with expand_line() for ``hosts``, then make it one line."""
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
139
def direct_format_text(text):
    """Escape line feeds and carriage returns so the text fits on one line."""
    escaped = text
    for raw, shown in (('\n', '\\n'), ('\r', '\\r')):
        escaped = escaped.replace(raw, shown)
    return escaped
143
4f0b5ca6
H
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names of statuses from their user ids.

    Looks at the author of every status in ``tl`` and, for retweets, at
    the author of the retweeted status.  All ids without a screen name
    are resolved with a single lookup() call and written back into the
    status dicts, which are returned as a new list in the same order.
    """
    def nameless_users(status):
        # user dicts of this status that have no (or an empty) screen
        # name: the retweeted author first, then the status author
        retweet = status.get('retweeted_status')
        if retweet and not retweet['user'].get('screen_name'):
            yield retweet['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # gather every id that needs resolving, then resolve them all at once
    wanted = set(user['id']
                 for status in tl
                 for user in nameless_users(status))
    names = lookup(twitter, list(wanted))

    # write the resolved names back and collect the statuses
    resolved = []
    for status in tl:
        for user in nameless_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
171
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API client the requests are issued on.
        screen_name: User whose statuses are fetched; when empty, the
            home timeline is fetched instead of a user timeline.
        max_id: When truthy, passed to the API as ``max_id``.
        mentions: When true, fetch mentions.
        favorites: When true (and mentions is not), fetch favorites.
        received_dms: None for ordinary statuses.  Otherwise direct
            messages are fetched: received ones when true, sent ones
            when false.

    Returns:
        A dict mapping each item's id to its archive text,
        "<date> <<sender>> <text>"; for direct messages the text is
        prefixed with "@<recipient>".

    Note:
        The text is passed through the module-level ``format_text``,
        which main() binds before fetching any statuses.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    # None means "not DMs"; True/False select received/sent, so this has
    # to be an identity test against None rather than a truthiness test
    is_dm = received_dms is not None

    tweets = {}
    if mentions:
        tl = twitter.statuses.mentions(**kwargs)
    elif favorites:
        tl = twitter.favorites(**kwargs) # API v1, favorites.list() in v1.1
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else: # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    else: # timeline
        if screen_name:
            tl = twitter.statuses.user_timeline(**kwargs)
        else: # self
            tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if is_dm:
        newtl = tl
    else:
        newtl = statuses_resolve_uids(twitter, tl)

    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        # DMs don't include mentions by default, so in order to show who
        # the recipient was, we synthesise a mention. If we're not
        # operating on DMs, behave as normal
        if is_dm:
            tweets[t['id']] = "%s <%s> @%s %s" % (format_date(t['created_at']),
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  format_text(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at']),
                                              t['user']['screen_name'],
                                              format_text(text))
    return tweets
218
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None):
    """Get all the statuses for a screen name.

    Fetches portions with statuses_portion(), walking backwards through
    ids, and merges each portion into ``tweets`` in place.  Stops when a
    portion adds fewer than 190 new entries, or when the API answers 401
    or 404.  On a 400 answer it sleeps until the reported rate-limit
    reset; other errors are retried after a short wait.
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, incrementing max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id,
                                       mentions, favorites, received_dms)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_epoch = rate['reset_time_in_seconds']
                reset_text = time.asctime(time.localtime(reset_epoch))
                delay = int(rate['reset_time_in_seconds']
                            - time.time()) + 5 # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            # every other API error is retried after a short pause
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known_before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known_before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", new))
            if new < 190:
                break
            max_id = min(portion) - 1 # browse backwards
            fail = Fail()
272
def rate_limit_status(twitter):
    """Print the remaining hourly API quota and when it next resets."""
    status = twitter.account.rate_limit_status()
    quota_line = ("Remaining API requests: %i/%i (hourly limit)"
                  % (status['remaining_hits'], status['hourly_limit']))
    print(quota_line)

    reset_epoch = status['reset_time_in_seconds']
    seconds_left = int(reset_epoch - time.time())
    reset_text = time.asctime(time.localtime(status['reset_time_in_seconds']))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
281
def main(args=sys.argv[1:]):
    """Command-line entry point of the archiver.

    Parses ``args``, authenticates when OAuth is requested, then, in this
    order: prints the API rate limit status and returns if asked to;
    archives the own timeline or mentions; archives the own direct
    messages; archives the tweets (or favorites) of every user given on
    the command line, or read from standard input when the only user is
    "-".

    Exits with status 1 on invalid options, when an operation that needs
    authentication is requested without OAuth, or on keyboard interrupt.
    """
    # statuses_portion() formats text through this module-level name
    global format_text

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # choose how tweet text is rendered: with or without following links
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # save own direct messages (the user used in OAuth)
    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = {}
        try:
            dms = load_tweets(filename)
        # "as" form: the "except Exception, e" spelling is a syntax error
        # on Python 3
        except Exception as e:
            err("Error when loading saved DMs: %s - continuing without"
                % str(e))

        try:
            # received and sent messages are merged into the same archive
            statuses(twitter, "", dms, received_dms=True)
            statuses(twitter, "", dms, received_dms=False)
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        users = [line.strip() for line in sys.stdin.readlines()]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        new = 0
        before = len(tweets)
        try:
            statuses(twitter, user, tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))
351 tweets = {}
352 try:
353 tweets = load_tweets(filename)
62ec1b07 354 except Exception as e:
a7282452
S
355 err("Error when loading saved tweets: %s - continuing without"
356 % str(e))
357
358 try:
4f0b5ca6 359 statuses(twitter, "", tweets, options['mentions'], options['favorites'])
a7282452
S
360 except KeyboardInterrupt:
361 err()
362 err("Interrupted")
363 raise SystemExit(1)
364
365 save_tweets(filename, tweets)
4f0b5ca6
H
366 if options['timeline']:
367 print("Total tweets in own timeline: %i" % len(tweets))
368 elif options['mentions']:
369 print("Total mentions: %i" % len(tweets))
a7282452 370
01618308
MC
371 if options['dms']:
372 if isinstance(auth, NoAuth):
373 err("You must be authenticated to save DMs.")
374 raise SystemExit(1)
375
376 filename = options['save-dir'] + os.sep + options['dms']
377 print("* Archiving own DMs in %s" % filename)
378
379 dms = {}
380 try:
381 dms = load_tweets(filename)
382 except Exception, e:
383 err("Error when loading saved DMs: %s - continuing without"
384 % str(e))
385
386 try:
387 statuses(twitter, "", dms, received_dms=True)
388 statuses(twitter, "", dms, received_dms=False)
389 except KeyboardInterrupt:
390 err()
391 err("Interrupted")
392 raise SystemExit(1)
393
394 save_tweets(filename, dms)
395 print("Total DMs sent and received: %i" % len(dms))
396
397
a7282452
S
398 # read users from command-line or stdin
399 users = options['extra_args']
400 if len(users) == 1 and users[0] == "-":
401 users = [line.strip() for line in sys.stdin.readlines()]
402
403 # save tweets for every user
404 total, total_new = 0, 0
405 for user in users:
406 filename = options['save-dir'] + os.sep + user
4f0b5ca6
H
407 if options['favorites']:
408 filename = filename + "-favorites"
a7282452
S
409 print("* Archiving %s tweets in %s" % (user, filename))
410
411 tweets = {}
412 try:
413 tweets = load_tweets(filename)
62ec1b07 414 except Exception as e:
a7282452
S
415 err("Error when loading saved tweets: %s - continuing without"
416 % str(e))
417
418 new = 0
419 before = len(tweets)
420 try:
4f0b5ca6 421 statuses(twitter, user, tweets, options['mentions'], options['favorites'])
a7282452
S
422 except KeyboardInterrupt:
423 err()
424 err("Interrupted")
425 raise SystemExit(1)
426
427 save_tweets(filename, tweets)
428 total += len(tweets)
429 new = len(tweets) - before
430 total_new += new
431 print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))
432
433 print("Total: %i tweets (%i new) for %i users"
434 % (total, total_new, len(users)))