]> jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
bandaid unicode/str.encode-related crash bug
[z_archive/twitter.git] / twitter / archiver.py
1 """USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4 DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11 OPTIONS
12 -o --oauth authenticate to Twitter using OAuth (default: no)
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
22 -d --dms <file> archive own direct messages (both received and
23 sent) into given file name.
24 -i --isoformat store dates in ISO format (specifically RFC 3339)
25
26 AUTHENTICATION
27 Authenticate to Twitter using OAuth to archive tweets of private profiles
28 and have higher API rate limits. OAuth authentication tokens are stored
29 in ~/.twitter-archiver_oauth.
30 """
31
32 from __future__ import print_function
33
34 import os, sys, time as _time, calendar, functools
35 from datetime import time, date, datetime
36 from getopt import gnu_getopt as getopt, GetoptError
37
38 try:
39 import urllib.request as urllib2
40 import http.client as httplib
41 except ImportError:
42 import urllib2
43 import httplib
44
45
# OAuth consumer credentials of the T-Archiver (Twitter-Archiver)
# application registered by @stalkr_.  main() passes them to oauth_dance()
# and OAuth() when the --oauth option is given.
CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
49
50 from .api import Twitter, TwitterError
51 from .oauth import OAuth, read_token_file
52 from .oauth_dance import oauth_dance
53 from .auth import NoAuth
54 from .util import Fail, err, expand_line, parse_host_list
55 from .follow import lookup
56 from .timezones import utc as UTC, Local
57
def parse_args(args, options):
    """Fill *options* in place from the command-line arguments *args*.

    Args:
        args: list of command-line argument strings (without program name).
        options: dict that receives one entry per recognised option, plus
            'extra_args' holding the remaining positional arguments.

    Raises:
        GetoptError: propagated from getopt for unknown or malformed options.
        SystemExit: with status 0 after printing the module help (-h/--help).
    """
    short_opts = "hos:at:m:vfr:d:i"
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites=", 'dms=', 'isoformat']
    parsed, positional = getopt(args, short_opts, long_opts)

    # Options that merely switch a setting on: flag -> options key.
    switches = {
        '-o': 'oauth', '--oauth': 'oauth',
        '-a': 'api-rate', '--api-rate': 'api-rate',
        '-v': 'favorites', '--favorites': 'favorites',
        '-f': 'follow-redirects', '--follow-redirects': 'follow-redirects',
        '-i': 'isoformat', '--isoformat': 'isoformat',
    }
    # Options that carry a value: flag -> options key.
    valued = {
        '-s': 'save-dir', '--save-dir': 'save-dir',
        '-t': 'timeline', '--timeline': 'timeline',
        '-m': 'mentions', '--mentions': 'mentions',
        '-r': 'redirect-sites', '--redirect-sites': 'redirect-sites',
        '-d': 'dms', '--dms': 'dms',
    }

    for flag, value in parsed:
        if flag in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if flag in switches:
            options[switches[flag]] = True
        elif flag in valued:
            options[valued[flag]] = value

    options['extra_args'] = positional
90
def load_tweets(filename):
    """Load tweets from a UTF-8 archive file into a dict, see save_tweets().

    Each line is expected to look like ``<tweet id> <tweet text>``. Lines
    that cannot be parsed or decoded are reported through err() and skipped.

    Args:
        filename: path of the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). An
        empty dict is returned when the file cannot be opened.
    """
    try:
        # Binary mode: the bytes are decoded explicitly below, which behaves
        # the same on Python 2 and Python 3.
        archive = open(filename, "rb")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:
        for raw in archive:
            try:
                tid, text = raw.strip().split(b" ", 1)
                tweets[int(tid)] = text.decode("utf-8")
            except Exception as e:
                # %r keeps the message printable even for undecodable bytes.
                err("loading tweet %r failed due to %s" % (raw, e))

    return tweets
108
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, ordered by
    ascending tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and the file is not touched) when *tweets* is
    empty. Failures to open the file or to write a single tweet are
    reported through err() instead of being raised.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    if not tweets:
        return

    try:
        # Binary mode: each line is encoded explicitly so the bytes on disk
        # are the same on Python 2 and Python 3.
        archive = open(filename, "wb")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:
        for tid in sorted(tweets):
            try:
                # Format as text first, then encode once; formatting already
                # encoded bytes would embed their b'...' repr on Python 3.
                line = "%i %s\n" % (tid, tweets[tid])
                archive.write(line.encode('utf-8'))
            except Exception as ex:
                err("archiving tweet %s failed due to %s" % (tid, ex))
137
def format_date(utc, isoformat=False):
    """Convert one of Twitter's UTC timestamps into a local-time string.

    Args:
        utc: timestamp text in the layout '%a %b %d %H:%M:%S +0000 %Y'.
        isoformat: if true, return datetime.isoformat() output; otherwise
            return the 'YYYY-MM-DD HH:MM:SS TZ' form.

    Returns:
        The timestamp converted to the Local timezone, as a string.
    """
    parsed = datetime.strptime(utc.replace('+0000', 'UTC'),
                               '%a %b %d %H:%M:%S %Z %Y')
    # Tag the parsed value as UTC, then shift it into the local timezone.
    local = parsed.replace(tzinfo=UTC).astimezone(Local)

    if isoformat:
        return local.isoformat()
    return local.strftime('%Y-%m-%d %H:%M:%S %Z')
154
def expand_format_text(hosts, text):
    """Run *text* through expand_line() for *hosts*, then make it one line."""
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
158
def direct_format_text(text):
    """Escape newlines and carriage returns so *text* fits on one line."""
    escaped = text.replace('\n', '\\n')
    escaped = escaped.replace('\r', '\\r')
    return escaped
162
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names of the statuses in *tl*.

    Every user (of a status or of its retweeted status) that has no
    'screen_name' is looked up by id with a single lookup() call, and the
    name is stored back into the status dicts, which are modified in place.

    Returns:
        A new list holding the same status objects, in the same order.
    """
    def nameless_users(status):
        # Yield the user dicts of this status that lack a screen name:
        # the retweeted status' user first, then the status' own user.
        retweet = status.get('retweeted_status')
        if retweet and not retweet['user'].get('screen_name'):
            yield retweet['user']
        if not status['user'].get('screen_name'):
            yield status['user']

    # collect every user id that needs a lookup
    pending_ids = []
    for status in tl:
        for user in nameless_users(status):
            pending_ids.append(user['id'])

    # resolve all of them at once
    names = lookup(twitter, list(set(pending_ids)))

    # write the resolved names back and build the result list
    resolved = []
    for status in tl:
        for user in nameless_users(status):
            user['screen_name'] = names[user['id']]
        resolved.append(status)

    return resolved
190
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    One API request (count=200) is made. The source is chosen in this
    order: mentions, favorites, direct messages, then the user timeline of
    *screen_name* or, when *screen_name* is false, the home timeline.

    Relies on the module-level ``format_text`` callable that main() sets
    before any archiving starts.

    Args:
        twitter: API object the request is issued on.
        screen_name: user whose statuses are requested.
        max_id: when true, sent to the API as ``max_id``.
        mentions: request the mentions timeline.
        favorites: request the favorites list.
        received_dms: None for statuses; True for received direct
            messages, False for sent ones.
        isoformat: forwarded to format_date().

    Returns:
        A dict mapping each item's id to its formatted archive text.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    # Identity test: None means "not DMs", while False means "sent DMs".
    is_dm = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name:  # timeline of a given user
        tl = twitter.statuses.user_timeline(**kwargs)
    else:  # self
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if is_dm:
        newtl = tl
    else:
        newtl = statuses_resolve_uids(twitter, tl)

    tweets = {}
    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        when = format_date(t['created_at'], isoformat=isoformat)
        if is_dm:
            # DMs don't include mentions by default, so in order to show
            # who the recipient was, we synthesise a mention.
            tweets[t['id']] = "%s <%s> @%s %s" % (when,
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  format_text(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (when,
                                              t['user']['screen_name'],
                                              format_text(text))
    return tweets
237
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get all the statuses for a screen name, merging them into *tweets*.

    Portions are requested with statuses_portion(), walking backwards by
    lowering max_id after each page, until a page adds fewer than 190 new
    entries to *tweets*. HTTP 401 and 404 errors stop the loop; a 429
    waits until the rate-limit reset reported by the API; other errors
    are retried after fail.wait(3).
    """
    max_id = None
    fail = Fail()
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites, received_dms, isoformat)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 429:
                err("Fail: %i API rate limit exceeded" % code)
                rls = twitter.application.rate_limit_status()
                reset_at = rls.rate_limit_reset
                reset_text = _time.asctime(_time.localtime(reset_at))
                # avoid race: wait a few seconds past the reset
                delay = int(reset_at - _time.time()) + 5
                err("Interval limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rls.rate_limit_limit,
                                                    reset_text, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            already_known = len(tweets)
            tweets.update(portion)
            new = len(tweets) - already_known
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", new))
            if new < 190:
                break
            # browse backwards: next page ends just below the oldest id seen
            max_id = min(portion) - 1
            # a successful page starts over with a fresh Fail object
            fail = Fail()
291
def rate_limit_status(twitter):
    """Print the remaining API requests and the time of the next reset."""
    rls = twitter.application.rate_limit_status()
    remaining_line = ("Remaining API requests: %i/%i (interval limit)"
                      % (rls.rate_limit_remaining, rls.rate_limit_limit))
    print(remaining_line)

    reset_at = rls.rate_limit_reset
    seconds_left = int(reset_at - _time.time())
    reset_text = _time.asctime(_time.localtime(reset_at))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
300
def main(args=None):
    """Command-line entry point of twitter-archiver.

    Parses *args*, optionally authenticates with OAuth, then, depending on
    the options, prints the API rate limit status, archives the
    authenticated user's own timeline or mentions, archives their direct
    messages, and finally archives the tweets (or favorites) of every user
    named on the command line or read from standard input.

    Args:
        args: list of command-line arguments; defaults to sys.argv[1:],
            read when main() is called.

    Raises:
        SystemExit: with status 1 on invalid options, missing
            authentication, or keyboard interruption.
    """
    if args is None:
        # Resolved here rather than in the signature, so the arguments are
        # those present when main() runs, not when the module was imported.
        args = sys.argv[1:]

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline, mentions or DMs
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.environ.get('HOME',
                                         os.environ.get('USERPROFILE', ''))
                          + os.sep
                          + '.twitter-archiver_oauth')

        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # statuses_portion() formats tweet text through this module-level hook
    global format_text
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", tweets, options['mentions'], options['favorites'], isoformat=options['isoformat'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # save own direct messages, received and sent, into one archive
    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = {}
        try:
            dms = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved DMs: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", dms, received_dms=True, isoformat=options['isoformat'])
            statuses(twitter, "", dms, received_dms=False, isoformat=options['isoformat'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: an empty user name would be archived as the
        # home timeline, into a path equal to the save directory itself.
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        before = len(tweets)
        try:
            statuses(twitter, user, tweets, options['mentions'], options['favorites'], isoformat=options['isoformat'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))