]> jfr.im git - z_archive/twitter.git/blame_incremental - twitter/archiver.py
Adapt code to archive DMs
[z_archive/twitter.git] / twitter / archiver.py
... / ...
CommitLineData
1"""USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11OPTIONS
12 -o --oauth authenticate to Twitter using OAuth (default: no)
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
22 -d --dms <file> archive own direct messages (both received and
23 sent) into given file name.
24
25AUTHENTICATION
26 Authenticate to Twitter using OAuth to archive tweets of private profiles
27 and have higher API rate limits. OAuth authentication tokens are stored
28 in ~/.twitter-archiver_oauth.
29"""
30
31from __future__ import print_function
32
33import os, sys, time, calendar, functools
34from getopt import gnu_getopt as getopt, GetoptError
35
36try:
37 import urllib.request as urllib2
38 import http.client as httplib
39except ImportError:
40 import urllib2
41 import httplib
42
43
44# T-Archiver (Twitter-Archiver) application registered by @stalkr_
45CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
46CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
47
48from .api import Twitter, TwitterError
49from .oauth import OAuth, read_token_file
50from .oauth_dance import oauth_dance
51from .auth import NoAuth
52from .util import Fail, err, expand_line, parse_host_list
53from .follow import lookup
54
def parse_args(args, options):
    """Fill ``options`` in place from the command-line arguments ``args``.

    Switches are stored as True, options that take a value store that
    value, and whatever positional arguments remain are stored under the
    'extra_args' key. -h/--help prints the module usage text and exits
    with status 0. A GetoptError raised by getopt is left to the caller.
    """
    short_opts = "hos:at:m:vfr:d:"
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects', "redirect-sites=", 'dms=']
    parsed, positional = getopt(args, short_opts, long_opts)

    # flags that simply switch a feature on
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
    }
    # flags whose argument is stored as-is
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
        '-d': 'dms', '--dms': 'dms',
    }

    for flag, value in parsed:
        if flag in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if flag in switches:
            options[switches[flag]] = True
        elif flag in valued:
            options[valued[flag]] = value

    options['extra_args'] = positional
85
def load_tweets(filename):
    """Load tweets from an archive file into a dict, see save_tweets().

    The archive is read as UTF-8 text, one tweet per line in the form
    "<tweet id> <tweet text>". Blank lines are ignored.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). The
        dict is empty when the file cannot be opened (no archive yet).

    Raises:
        ValueError: a non-blank line has no "<id> <text>" shape, its id is
            not an integer, or the file is not valid UTF-8.
    """
    # Local import: io.open decodes UTF-8 the same way on Python 2 and 3,
    # whereas str.decode() does not exist on Python 3.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    # "with" guarantees the file is closed even if a line fails to parse
    with archive:
        for line in archive:
            line = line.strip()
            if not line:
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
100
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when ``tweets`` is empty.
    If the file cannot be opened, the error is reported through err() and
    the function returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import: io.open encodes to UTF-8 on both Python 2 and 3.
    # Formatting an already-encoded bytes value with %s would write its
    # b'...' repr on Python 3.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    # "with" closes the file even if a write fails part-way through
    with archive:
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
126
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: date string in the form "Wed Aug 27 13:08:45 +0000 2008".
        to_localtime: when true and the local zone is not at UTC offset 0,
            convert to local time; otherwise keep UTC.

    Returns:
        "YYYY-MM-DD HH:MM:SS TZ", where TZ is "UTC" or the local zone name
        that applies at that date (standard or daylight-saving).

    Raises:
        ValueError: ``utc`` does not match the expected format.
    """
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # pick the zone name matching the converted date: tzname[1] is the
        # DST name and must only be used when DST is in effect
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
135
def expand_format_text(hosts, text):
    """Run text through expand_line() for the given hosts (link redirect
    following), then flatten the result to a single line."""
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
139
def direct_format_text(text):
    """Escape line feeds and carriage returns so the text fits on one line."""
    escaped = text
    for raw, visible in (('\n', '\\n'), ('\r', '\\r')):
        escaped = escaped.replace(raw, visible)
    return escaped
143
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names in a list of statuses.

    Every status (and its 'retweeted_status', if any) whose 'user' entry
    has no 'screen_name' gets one, obtained with a single lookup() call on
    the collected user ids. Statuses are updated in place and returned in
    a new list, in their original order.
    """
    def nameless_users(status):
        # yield the user dicts of this status that lack a screen name,
        # retweeted author first, then the status author
        retweet = status.get('retweeted_status')
        if retweet and not retweet['user'].get('screen_name'):
            yield retweet['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # collect every user id that needs resolving
    pending = []
    for status in tl:
        for user in nameless_users(status):
            pending.append(user['id'])

    # one lookup for all of them, duplicates removed
    names = lookup(twitter, list(set(pending)))

    # patch the statuses with the resolved names
    resolved = []
    for status in tl:
        for user in nameless_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
171
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API client on which the timeline/DM endpoints are called.
        screen_name: user to query; an empty value selects the
            authenticated user's home timeline in timeline mode.
        max_id: when set, passed to the API as 'max_id'.
        mentions: fetch mentions instead of a timeline.
        favorites: fetch favorites instead of a timeline.
        received_dms: None for regular statuses; True to fetch received
            direct messages, False to fetch sent ones.

    Returns:
        A dict mapping status id to "<date> <<sender>> <text>". For direct
        messages the text is prefixed with "@<recipient> ".
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    is_dm = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions(**kwargs)
    elif favorites:
        tl = twitter.favorites(**kwargs)  # API v1, favorites.list() in v1.1
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    else:  # timeline
        if screen_name:
            tl = twitter.statuses.user_timeline(**kwargs)
        else:  # self
            tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if not is_dm:
        tl = statuses_resolve_uids(twitter, tl)

    # main() installs the module-level format_text; fall back to the plain
    # single-line formatter when this function is used without main()
    formatter = globals().get('format_text', direct_format_text)

    tweets = {}
    for t in tl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        if not is_dm:
            tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at']),
                                              t['user']['screen_name'],
                                              formatter(text))
        else:
            # DMs don't include mentions by default, so in order to show
            # who the recipient was, we synthesise a mention
            tweets[t['id']] = "%s <%s> @%s %s" % (format_date(t['created_at']),
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  formatter(text))
    return tweets
218
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None):
    """Collect all reachable statuses of a screen name into ``tweets``.

    Portions are fetched with statuses_portion(), walking backwards through
    ids, and merged into the ``tweets`` dict in place. Browsing stops once
    a portion brings fewer than 190 new entries, or on a 401/404 answer.
    Other failures are reported with err() and retried after a wait handled
    by a Fail counter, which is renewed after each successful portion.
    """
    oldest_id = None
    failures = Fail()
    # get portions of statuses, lowering max id until no new tweets appear
    while True:
        try:
            batch = statuses_portion(twitter, screen_name, oldest_id, mentions, favorites, received_dms)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                limits = twitter.account.rate_limit_status()
                reset_epoch = limits['reset_time_in_seconds']
                reset_text = time.asctime(time.localtime(reset_epoch))
                delay = int(reset_epoch - time.time()) + 5  # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (limits['hourly_limit'],
                                                    reset_text, delay))
                failures.wait(delay)
                continue
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            failures.wait(3)
            continue
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            failures.wait(3)
            continue
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            failures.wait(3)
            continue
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            failures.wait(3)
            continue

        # success: merge the portion and count what was actually new
        before = len(tweets)
        tweets.update(batch)
        added = len(tweets) - before
        err("Browsing %s statuses, new tweets: %i"
            % (screen_name or "home", added))
        if added < 190:
            break
        oldest_id = min(batch.keys()) - 1  # browse backwards
        failures = Fail()
272
def rate_limit_status(twitter):
    """Print the remaining API requests and the time of the next reset."""
    status = twitter.account.rate_limit_status()
    quota = (status['remaining_hits'], status['hourly_limit'])
    print("Remaining API requests: %i/%i (hourly limit)" % quota)

    reset_at = status['reset_time_in_seconds']
    seconds_left = int(reset_at - time.time())
    reset_text = time.asctime(time.localtime(reset_at))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
281
def main(args=sys.argv[1:]):
    """Command-line entry point of twitter-archiver.

    Parses ``args``, optionally authenticates with OAuth, then depending on
    the options: prints the API rate limit status, archives the
    authenticated user's timeline or mentions, archives their direct
    messages, and finally archives the tweets (or favorites) of every user
    given on the command line or on standard input ("-").

    Args:
        args: list of command-line arguments, without the program name.

    Raises:
        SystemExit: with status 1 on invalid options, on a missing
            authentication for timeline/mentions/DMs, or when interrupted
            from the keyboard.
    """
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # format_text is a module-level hook read by statuses_portion(); choose
    # the implementation once, according to the redirect options
    global format_text
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # save own direct messages, received and sent, into a single archive
    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = {}
        try:
            dms = load_tweets(filename)
        except Exception as e:  # "as" form: valid on Python 2.6+ and 3
            err("Error when loading saved DMs: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", dms, received_dms=True)
            statuses(twitter, "", dms, received_dms=False)
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        users = [line.strip() for line in sys.stdin.readlines()]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        new = 0
        before = len(tweets)
        try:
            statuses(twitter, user, tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))