]> jfr.im git - z_archive/twitter.git/blame - twitter/archiver.py
Fix some tests.
[z_archive/twitter.git] / twitter / archiver.py
CommitLineData
a7282452
S
1"""USAGE
2 twitter-archiver [options] <-|user> [<user> ...]
3
4DESCRIPTION
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
10
11OPTIONS
4f0b5ca6 12 -o --oauth authenticate to Twitter using OAuth (default: no)
a7282452
S
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
4f0b5ca6
H
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
907402f6 20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
a7282452
S
22
23AUTHENTICATION
24 Authenticate to Twitter using OAuth to archive tweets of private profiles
25 and have higher API rate limits. OAuth authentication tokens are stored
26 in ~/.twitter-archiver_oauth.
27"""
28
29from __future__ import print_function
30
62ec1b07 31import os, sys, time, calendar, functools
a7282452
S
32from getopt import gnu_getopt as getopt, GetoptError
33
62ec1b07 34try:
35 import urllib.request as urllib2
36 import http.client as httplib
37except ImportError:
38 import urllib2
39 import httplib
40
41
a7282452
S
# OAuth consumer credentials of the "T-Archiver" (Twitter-Archiver)
# application, registered by @stalkr_.
CONSUMER_KEY = 'd8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET = 'AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
45
46from .api import Twitter, TwitterError
47from .oauth import OAuth, read_token_file
48from .oauth_dance import oauth_dance
49from .auth import NoAuth
907402f6 50from .util import Fail, err, expand_line, parse_host_list
a7282452
S
51from .follow import lookup
52
def parse_args(args, options):
    """Fill ``options`` in place from the command-line arguments ``args``.

    Flag options are stored as ``True``; options that carry a value are
    stored as the given string.  Positional arguments that remain after
    option parsing are stored under ``options['extra_args']``.  ``-h`` /
    ``--help`` prints the module docstring and exits with status 0.

    Raises:
        GetoptError: on an unknown option or a missing option value.
    """
    short_opts = "hos:at:m:vfr:"
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 'redirect-sites=']

    # Every accepted spelling maps to (key in options, carries a value?).
    targets = {}
    for letter, key, takes_value in (
            ('o', 'oauth', False),
            ('s', 'save-dir', True),
            ('a', 'api-rate', False),
            ('t', 'timeline', True),
            ('m', 'mentions', True),
            ('v', 'favorites', False),
            ('f', 'follow-redirects', False),
            ('r', 'redirect-sites', True)):
        targets['-' + letter] = targets['--' + key] = (key, takes_value)

    parsed, extra_args = getopt(args, short_opts, long_opts)

    for name, value in parsed:
        if name in ('-h', '--help'):
            print(__doc__)
            raise SystemExit(0)
        if name in targets:
            key, takes_value = targets[name]
            options[key] = value if takes_value else True

    options['extra_args'] = extra_args
81
def load_tweets(filename):
    """Load tweets from a UTF-8 archive file into a dict, see save_tweets().

    Each non-blank line is expected to be ``<tweet id> <tweet text>``.

    Args:
        filename: A string representing the archive file to read.

    Returns:
        A dict mapping tweet-ids (int) to tweet text (unicode text).  An
        empty dict is returned when the archive cannot be opened, which is
        the normal case on a first run.

    Raises:
        ValueError: if a non-blank line has no text part or its id is not
            an integer.
    """
    # Local import: io.open decodes UTF-8 the same way on Python 2 and 3,
    # whereas calling .decode() on each line only works on Python 2.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # close the file even if a line fails to parse
        for line in archive:
            line = line.strip()
            if not line:
                # tolerate stray blank lines instead of failing the load
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
96
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written (and no file is created) when ``tweets`` is empty.
    If the file cannot be opened, the error is reported through err() and
    the function returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import: io.open encodes to UTF-8 on both Python 2 and 3.
    # Formatting text.encode('utf-8') with %s on Python 3 would write the
    # bytes repr (b'...') into the archive.
    import io

    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # close the file even if a write fails
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
122
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: Date string as sent by Twitter, e.g.
            "Mon Jan 02 15:04:05 +0000 2006".
        to_localtime: When true and the local zone has a non-zero standard
            offset from UTC (time.timezone != 0), convert to local time.

    Returns:
        "YYYY-MM-DD HH:MM:SS TZ", where TZ is the local zone name in effect
        at that date, or "UTC" when no conversion is done.
    """
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name); choose the one that
        # applies to this date rather than always using the DST name.
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
131
def expand_format_text(hosts, text):
    """Pass text through expand_line() for the given hosts, then make it single-line."""
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
135
def direct_format_text(text):
    """Return text on a single line, with newlines and carriage returns
    replaced by the two-character sequences \\n and \\r."""
    flattened = text
    for raw, visible in (('\n', '\\n'), ('\r', '\\r')):
        flattened = flattened.replace(raw, visible)
    return flattened
139
4f0b5ca6
H
def statuses_resolve_uids(twitter, tl):
    """Fill in missing screen names in a list of statuses.

    Some statuses carry only a user id for their author, or for the author
    of the retweeted status.  All such ids are gathered, resolved with one
    lookup() call, and written back into the status dicts in place.

    Returns:
        A new list holding the same (now completed) status dicts, in the
        same order as ``tl``.
    """
    def lacks_name(user):
        return not user.get('screen_name')

    # first pass: gather every user id that has no screen_name
    pending = []
    for status in tl:
        retweet = status.get('retweeted_status')
        if retweet and lacks_name(retweet['user']):
            pending.append(retweet['user']['id'])
        if lacks_name(status['user']):
            pending.append(status['user']['id'])

    # one lookup for all distinct ids
    names = lookup(twitter, list(set(pending)))

    # second pass: write the resolved names back
    resolved = []
    for status in tl:
        retweet = status.get('retweeted_status')
        if retweet and lacks_name(retweet['user']):
            retweet['user']['screen_name'] = names[retweet['user']['id']]
        if lacks_name(status['user']):
            status['user']['screen_name'] = names[status['user']['id']]
        resolved.append(status)

    return resolved
167
4f0b5ca6
H
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False):
    """Fetch one page (up to 200) of statuses and format them for the archive.

    The endpoint is chosen in this order: mentions, favorites, the user
    timeline of ``screen_name``, or the home timeline when ``screen_name``
    is empty.

    Returns:
        A dict mapping tweet id to "<date> <<screen_name>> <text>", where
        retweets are rendered as "RT @<user>: <original text>".
    """
    kwargs = {'count': 200, 'include_rts': 1, 'screen_name': screen_name}
    if max_id:
        kwargs['max_id'] = max_id

    if mentions:
        fetch = twitter.statuses.mentions
    elif favorites:
        fetch = twitter.favorites  # API v1, favorites.list() in v1.1
    elif screen_name:
        fetch = twitter.statuses.user_timeline
    else:  # self
        fetch = twitter.statuses.home_timeline
    tl = fetch(**kwargs)

    tweets = {}
    # some tweets do not provide screen name but user id, resolve those
    for status in statuses_resolve_uids(twitter, tl):
        text = status['text']
        retweet = status.get('retweeted_status')
        if retweet:
            text = "RT @%s: %s" % (retweet['user']['screen_name'],
                                   retweet['text'])
        when = format_date(status['created_at'])
        author = status['user']['screen_name']
        tweets[status['id']] = "%s <%s> %s" % (when, author, format_text(text))
    return tweets
195
4f0b5ca6
H
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False):
    """Fetch statuses page by page, walking backwards, into ``tweets``.

    ``tweets`` (tweet id -> formatted text) is updated in place.  Paging
    stops once a page adds fewer than 190 new tweets, or when the API
    answers 401 (protected) or 404 (no such profile).  On 400 the function
    sleeps until the rate limit resets; other errors are retried after a
    short wait through the Fail helper.
    """
    max_id = None
    fail = Fail()
    # fetch pages, lowering max_id each time, until few new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id,
                                       mentions, favorites)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            if code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            if code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_epoch = rate['reset_time_in_seconds']
                reset = time.asctime(time.localtime(reset_epoch))
                delay = int(reset_epoch - time.time()) + 5  # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset, delay))
                fail.wait(delay)
                continue
            if code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name or "home", new))
            if new < 190:
                break
            max_id = min(portion.keys()) - 1  # browse backwards
            fail = Fail()  # a successful page resets the failure helper
249
def rate_limit_status(twitter):
    """Print the remaining API requests and the time of the next reset."""
    status = twitter.account.rate_limit_status()
    print("Remaining API requests: %i/%i (hourly limit)"
          % (status['remaining_hits'], status['hourly_limit']))
    reset_epoch = status['reset_time_in_seconds']
    seconds_left = int(reset_epoch - time.time())
    reset_text = time.asctime(time.localtime(reset_epoch))
    print("Next reset in %is (%s)" % (seconds_left, reset_text))
258
def main(args=None):
    """Command-line entry point of twitter-archiver.

    Parses ``args``, optionally authenticates with OAuth, then, depending
    on the options, prints the API rate limit status, archives the
    authenticated user's own timeline or mentions, and archives the tweets
    (or favorites) of every user named on the command line or on standard
    input.

    Args:
        args: List of command-line arguments without the program name.
            Defaults to ``sys.argv[1:]``, read when main() is called.

    Raises:
        SystemExit: with status 1 on invalid options, on a keyboard
            interrupt, or when timeline/mentions are requested without
            OAuth.
    """
    if args is None:
        # Read sys.argv at call time; a default of sys.argv[1:] in the
        # signature would be frozen when the module is imported.
        args = sys.argv[1:]

    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline or mentions
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # statuses_portion() reads the module-level format_text, so the text
    # formatter is chosen here from the redirect options.
    global format_text
    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        try:
            statuses(twitter, "", tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: statuses() treats an empty screen name as the
        # home timeline, and the archive path would be the directory itself.
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = {}
        try:
            tweets = load_tweets(filename)
        except Exception as e:
            err("Error when loading saved tweets: %s - continuing without"
                % str(e))

        before = len(tweets)
        try:
            statuses(twitter, user, tweets, options['mentions'], options['favorites'])
        except KeyboardInterrupt:
            err()
            err("Interrupted")
            raise SystemExit(1)

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))