# twitter/archiver.py (from z_archive/twitter.git, as served by jfr.im git)
"""USAGE
    twitter-archiver [options] <-|user> [<user> ...]

DESCRIPTION
    Archive tweets of users, sorted by date from oldest to newest, in
    the following format: <id> <date> <<screen_name>> <tweet_text>
    Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
    resume archiving on next run. Archive file name is the user name.
    Provide "-" instead of users to read users from standard input.

OPTIONS
 -o --oauth            authenticate to Twitter using OAuth (default no)
 -s --save-dir <path>  directory to save archives (default: current dir)
 -a --api-rate         see current API rate limit status
 -t --timeline <file>  archive own timeline into given file name (requires
                       OAuth, max 800 statuses).

AUTHENTICATION
    Authenticate to Twitter using OAuth to archive tweets of private profiles
    and have higher API rate limits. OAuth authentication tokens are stored
    in ~/.twitter-archiver_oauth.
"""
# NOTE(review): the section headings (USAGE, DESCRIPTION, OPTIONS,
# AUTHENTICATION) were restored by inference; confirm against version control.

from __future__ import print_function

import os
import sys
import time
import calendar
import urllib2
import httplib
from getopt import gnu_getopt as getopt, GetoptError

# T-Archiver (Twitter-Archiver) application registered by @stalkr_
CONSUMER_KEY = 'd8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET = 'AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'

from .api import Twitter, TwitterError
from .oauth import OAuth, read_token_file
from .oauth_dance import oauth_dance
from .auth import NoAuth
from .util import Fail, err
from .follow import lookup
def parse_args(args, options):
    """Parse arguments from command-line to set options.

    Args:
        args: A list of command-line argument strings (without the
            program name).
        options: A dict updated in place. Recognized options set the keys
            'oauth', 'save-dir', 'api-rate' and 'timeline'; the remaining
            positional arguments are always stored under 'extra_args'.

    Raises:
        GetoptError: An option is unknown or lacks its required argument.
        SystemExit: Help was requested.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=']
    short_opts = "hos:at:"
    opts, extra_args = getopt(args, short_opts, long_opts)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            # NOTE(review): the body of this branch is reconstructed as
            # "print the usage text and exit successfully"; confirm.
            print(__doc__)
            raise SystemExit(0)
        elif opt in ('-o', '--oauth'):
            options['oauth'] = True
        elif opt in ('-s', '--save-dir'):
            options['save-dir'] = arg
        elif opt in ('-a', '--api-rate'):
            options['api-rate'] = True
        elif opt in ('-t', '--timeline'):
            options['timeline'] = arg

    options['extra_args'] = extra_args
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Args:
        filename: A string naming the UTF-8 encoded archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). The
        dict is empty when the archive cannot be opened, e.g. because it
        does not exist yet.

    Raises:
        ValueError: A non-blank line is not of the form "<int id> <text>",
            or the file is not valid UTF-8.
    """
    # Local import keeps this block self-contained; io.open decodes UTF-8
    # the same way on Python 2 and Python 3.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError: # no archive (yet)
        return {}

    tweets = {}
    with archive:  # closed even when a malformed line raises below
        for line in archive:
            line = line.strip()
            if not line:
                # a stray blank line should not abort the whole load
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Lines are written in ascending tweet-id order. Nothing is written when
    the dict is empty. A failure to open or write the file is reported
    through err() instead of being raised.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import keeps this block self-contained; io.open encodes UTF-8
    # the same way on Python 2 and Python 3.
    import io

    if not tweets:
        # Never replace an existing archive with an empty file.
        # NOTE(review): this guard is restored by inference; confirm.
        return

    try:
        with io.open(filename, "w", encoding="utf-8") as archive:
            for k in sorted(tweets):
                archive.write(u"%i %s\n" % (k, tweets[k]))
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: A date string such as "Wed Aug 29 17:12:58 +0000 2012".
        to_localtime: When true and the local zone's standard offset from
            UTC is non-zero, convert to local time; otherwise keep UTC.

    Returns:
        A string "YYYY-MM-DD HH:MM:SS TZ", where TZ is "UTC" or the local
        time zone name in effect at that instant.
    """
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name): pick the one that
        # applies to this date rather than always using the DST name.
        zone = time.tzname[1] if t.tm_isdst > 0 else time.tzname[0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    else:
        return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
def format_text(text):
    """Transform special chars in text to have only one line.

    Line feeds and carriage returns are replaced by the two-character
    sequences backslash-n and backslash-r, so the text fits on one line.
    """
    return text.replace('\n', '\\n').replace('\r', '\\r')
def timeline_resolve_uids(twitter, tl):
    """Resolve user ids to screen names from a timeline.

    Statuses (and their 'retweeted_status', when present) whose 'user' has
    no 'screen_name' get one filled in, in place, from a single lookup()
    call covering all such user ids.

    Args:
        twitter: The API client handed to lookup().
        tl: An iterable of status dicts.

    Returns:
        A new list holding the same (updated) status dicts, in order.
    """
    def lacks_name(status):
        return not status['user'].get('screen_name')

    # get all user ids that needs a lookup (no screen_name key)
    user_ids = set()
    for t in tl:
        rt = t.get('retweeted_status')
        if rt and lacks_name(rt):
            user_ids.add(rt['user']['id'])
        if lacks_name(t):
            user_ids.add(t['user']['id'])

    if not user_ids:
        # nothing to resolve: do not spend an API request on it
        return list(tl)

    # resolve all of them at once
    names = lookup(twitter, list(user_ids))

    # build new timeline with resolved uids
    new_tl = []
    for t in tl:
        rt = t.get('retweeted_status')
        if rt and lacks_name(rt):
            rt['user']['screen_name'] = names[rt['user']['id']]
        if lacks_name(t):
            t['user']['screen_name'] = names[t['user']['id']]
        new_tl.append(t)

    return new_tl
def timeline_portion(twitter, screen_name, max_id=None):
    """Get a portion of the timeline of a screen name.

    Args:
        twitter: The API client; statuses.user_timeline or
            statuses.home_timeline is called on it.
        screen_name: The user whose timeline is fetched; an empty value
            selects the authenticated user's home timeline instead.
        max_id: When set, passed to the API as 'max_id'.

    Returns:
        A dict mapping tweet ids to one-line tweet text of the form
        "<date> <<screen_name>> [RT @<user>: ]<text>".
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    if screen_name:
        tl = twitter.statuses.user_timeline(**kwargs)
    else: # self
        tl = twitter.statuses.home_timeline(**kwargs)

    tweets = {}
    # some tweets do not provide screen name but user id, resolve those
    for t in timeline_resolve_uids(twitter, tl):
        rt = t.get('retweeted_status')
        if rt:
            # use the retweeted status' own text, prefixed by its author
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        else:
            text = t['text']
        tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at']),
                                          t['user']['screen_name'],
                                          format_text(text))
    return tweets
def timeline(twitter, screen_name, tweets):
    """Get the entire timeline of tweets for a screen name.

    Fetches the timeline portion by portion, going backwards, and merges
    every portion into `tweets` (updated in place). Stops on a 401 or 404
    API error, or once a portion brings few new tweets.

    Args:
        twitter: The API client.
        screen_name: The user to archive; empty for the home timeline.
        tweets: A dict mapping tweet ids to tweet text, updated in place.

    NOTE(review): the loop header, the Fail()-based retry/backoff calls,
    the break/continue statements and the "new < 190" stop condition are
    restored by inference; confirm them against version control.
    """
    max_id = None
    fail = Fail()
    # get portions of timeline, incrementing max id until no new tweets appear
    while True:
        try:
            portion = timeline_portion(twitter, screen_name, max_id)
        except TwitterError as e:
            if e.e.code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % e.e.code)
                break
            elif e.e.code == 400:
                err("Fail: %i API rate limit exceeded" % e.e.code)
                rate = twitter.account.rate_limit_status()
                reset = rate['reset_time_in_seconds']
                reset = time.asctime(time.localtime(reset))
                delay = int(rate['reset_time_in_seconds']
                            - time.time()) + 5 # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset, delay))
                fail.wait(delay)
                continue
            elif e.e.code == 404:
                err("Fail: %i This profile does not exist" % e.e.code)
                break
            elif e.e.code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % e.e.code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            # count new tweets as the growth of the dict across the merge
            new = -len(tweets)
            tweets.update(portion)
            new += len(tweets)
            err("Browsing %s timeline, new tweets: %i"
                % (screen_name if screen_name else "home", new))
            if new < 190:
                break
            max_id = min(portion.keys()) # browse backwards
            fail = Fail()
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Reads 'remaining_hits', 'hourly_limit' and 'reset_time_in_seconds'
    from twitter.account.rate_limit_status() and prints the remaining
    request count and the time left until the next reset.
    """
    r = twitter.account.rate_limit_status()
    reset = r['reset_time_in_seconds']
    print("Remaining API requests: %i/%i (hourly limit)"
          % (r['remaining_hits'], r['hourly_limit']))
    print("Next reset in %is (%s)"
          % (int(reset - time.time()),
             time.asctime(time.localtime(reset))))
def _archive(twitter, screen_name, filename):
    """Load, extend and save one archive; return (total, new) tweet counts."""
    tweets = {}
    try:
        tweets = load_tweets(filename)
    except Exception as e:
        err("Error when loading saved tweets: %s - continuing without"
            % str(e))

    before = len(tweets)
    try:
        timeline(twitter, screen_name, tweets)
    except KeyboardInterrupt:
        # NOTE(review): handler body restored by inference (report and exit
        # without saving); confirm against version control.
        print()
        err("Interrupted")
        raise SystemExit(1)

    save_tweets(filename, tweets)
    return len(tweets), len(tweets) - before


def main(args=sys.argv[1:]):
    """Command-line entry point of twitter-archiver.

    Parses the options, authenticates (OAuth or none), then either prints
    the API rate limit status or archives the own timeline and/or the
    timelines of the given users into files under the save directory.

    Args:
        args: A list of command-line argument strings.

    NOTE(review): the option defaults, the usage/exit paths and the
    non-OAuth branch are restored by inference; confirm them against
    version control.
    """
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate or archive of timeline
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.getenv("HOME", "") + os.sep
                          + ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # save own timeline (the user used in OAuth)
    if options['timeline']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['timeline']
        print("* Archiving own timeline in %s" % filename)

        # no screen_name means we want home_timeline, not user_timeline
        count, _ = _archive(twitter, "", filename)
        print("Total tweets in own timeline: %i" % count)

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # skip blank lines: an empty screen name would select the home
        # timeline instead of a user's
        users = [name for name in (line.strip() for line in sys.stdin)
                 if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        print("* Archiving %s tweets in %s" % (user, filename))

        count, new = _archive(twitter, user, filename)
        total += count
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, count, new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))