]>
jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
twitter-archiver [options] <-|user> [<user> ...]

Archive tweets of users, sorted by date from oldest to newest, in
the following format: <id> <date> <<screen_name>> <tweet_text>
Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
resume archiving on next run. Archive file name is the user name.
Provide "-" instead of users to read users from standard input.
-o --oauth              authenticate to Twitter using OAuth (default: no)
-s --save-dir <path>    directory to save archives (default: current dir)
-a --api-rate           see current API rate limit status
-t --timeline <file>    archive own timeline into given file name (requires
                        OAuth, max 800 statuses)
Authenticate to Twitter using OAuth to archive tweets of private profiles
and have higher API rate limits. OAuth authentication tokens are stored
in ~/.twitter-archiver_oauth.
24 from __future__
import print_function
26 import os
, sys
, time
, calendar
, urllib2
, httplib
27 from getopt
import gnu_getopt
as getopt
, GetoptError
# T-Archiver (Twitter-Archiver) application registered by @stalkr_
# OAuth consumer credentials for that application; main() passes them to
# oauth_dance() and OAuth() when OAuth authentication is used.
CONSUMER_KEY = 'd8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET = 'AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
33 from .api
import Twitter
, TwitterError
34 from .oauth
import OAuth
, read_token_file
35 from .oauth_dance
import oauth_dance
36 from .auth
import NoAuth
37 from .util
import Fail
, err
38 from .follow
import lookup
40 def parse_args(args
, options
):
41 """Parse arguments from command-line to set options."""
42 long_opts
= ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=']
43 short_opts
= "hos:at:"
44 opts
, extra_args
= getopt(args
, short_opts
, long_opts
)
47 if opt
in ('-h', '--help'):
50 elif opt
in ('-o', '--oauth'):
51 options
['oauth'] = True
52 elif opt
in ('-s', '--save-dir'):
53 options
['save-dir'] = arg
54 elif opt
in ('-a', '--api-rate'):
55 options
['api-rate' ] = True
56 elif opt
in ('-t', '--timeline'):
57 options
['timeline'] = arg
59 options
['extra_args'] = extra_args
61 def load_tweets(filename
):
62 """Load tweets from file into dict, see save_tweets()."""
64 archive
= open(filename
,"r")
65 except IOError: # no archive (yet)
69 for line
in archive
.readlines():
70 tid
, text
= line
.strip().split(" ", 1)
71 tweets
[int(tid
)] = text
.decode("utf-8")
76 def save_tweets(filename
, tweets
):
77 """Save tweets from dict to file.
79 Save tweets from dict to UTF-8 encoded file, one per line:
80 <tweet id (number)> <tweet text>
82 <date> <<user>> [RT @<user>: ]<text>
85 filename: A string representing the file name to save tweets to.
86 tweets: A dict mapping tweet-ids (int) to tweet text (str).
92 archive
= open(filename
,"w")
94 err("Cannot save tweets: %s" % str(e
))
97 for k
in sorted(tweets
.keys()):
98 archive
.write("%i %s\n" % (k
, tweets
[k
].encode('utf-8')))
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: Date string as sent by Twitter, for example
            "Wed Aug 29 17:12:58 +0000 2012".
        to_localtime: When true and the local zone has a non-zero standard
            UTC offset, convert the date to local time; otherwise the date
            is kept in UTC.

    Returns:
        The date formatted as "YYYY-MM-DD HH:MM:SS TZ", where TZ is the
        local zone name in effect at that date, or "UTC".

    Raises:
        ValueError: If utc does not match the expected Twitter format.
    """
    # strptime's %Z does not understand a numeric "+0000" offset, so it is
    # swapped for the zone name "UTC", which %Z always recognises.
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name). Pick the one that applies
        # to this particular date instead of always using the DST name;
        # tm_isdst may be -1 (unknown), which is treated as standard time.
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
def format_text(text):
    """Escape line breaks so that the text fits on a single line.

    Newline and carriage-return characters are replaced by the two-character
    sequences backslash-n and backslash-r; everything else is left untouched.
    """
    single_line = text
    # Same order as a chained replace: newlines first, then carriage returns.
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        single_line = single_line.replace(raw, escaped)
    return single_line
115 def timeline_resolve_uids(twitter
, tl
):
116 """Resolve user ids to screen names from a timeline."""
    # get all user ids that need a lookup (no screen_name key)
120 rt
= t
.get('retweeted_status')
121 if rt
and not rt
['user'].get('screen_name'):
122 user_ids
.append(rt
['user']['id'])
123 if not t
['user'].get('screen_name'):
124 user_ids
.append(t
['user']['id'])
126 # resolve all of them at once
127 names
= lookup(twitter
, list(set(user_ids
)))
129 # build new timeline with resolved uids
132 rt
= t
.get('retweeted_status')
133 if rt
and not rt
['user'].get('screen_name'):
134 name
= names
[rt
['user']['id']]
135 t
['retweeted_status']['user']['screen_name'] = name
136 if not t
['user'].get('screen_name'):
137 name
= names
[t
['user']['id']]
138 t
['user']['screen_name'] = name
143 def timeline_portion(twitter
, screen_name
, max_id
=None):
144 """Get a portion of the timeline of a screen name."""
145 kwargs
= dict(count
=200, include_rts
=1, screen_name
=screen_name
)
147 kwargs
['max_id'] = max_id
151 tl
= twitter
.statuses
.user_timeline(**kwargs
)
153 tl
= twitter
.statuses
.home_timeline(**kwargs
)
155 # some tweets do not provide screen name but user id, resolve those
156 for t
in timeline_resolve_uids(twitter
, tl
):
158 rt
= t
.get('retweeted_status')
160 text
= "RT @%s: %s" % (rt
['user']['screen_name'], rt
['text'])
161 tweets
[t
['id']] = "%s <%s> %s" % (format_date(t
['created_at']),
162 t
['user']['screen_name'],
167 def timeline(twitter
, screen_name
, tweets
):
168 """Get the entire timeline of tweets for a screen name."""
171 # get portions of timeline, incrementing max id until no new tweets appear
174 portion
= timeline_portion(twitter
, screen_name
, max_id
)
175 except TwitterError
as e
:
177 err("Fail: %i Unauthorized (tweets of that user are protected)"
180 elif e
.e
.code
== 400:
181 err("Fail: %i API rate limit exceeded" % e
.e
.code
)
182 rate
= twitter
.account
.rate_limit_status()
183 reset
= rate
['reset_time_in_seconds']
184 reset
= time
.asctime(time
.localtime(reset
))
185 delay
= int(rate
['reset_time_in_seconds']
186 - time
.time()) + 5 # avoid race
187 err("Hourly limit of %i requests reached, next reset on %s: "
188 "going to sleep for %i secs" % (rate
['hourly_limit'],
192 elif e
.e
.code
== 502:
193 err("Fail: %i Service currently unavailable, retrying..."
196 err("Fail: %s\nRetrying..." % str(e
)[:500])
198 except urllib2
.URLError
as e
:
199 err("Fail: urllib2.URLError %s - Retrying..." % str(e
))
201 except httplib
.error
as e
:
202 err("Fail: httplib.error %s - Retrying..." % str(e
))
204 except KeyError as e
:
205 err("Fail: KeyError %s - Retrying..." % str(e
))
209 tweets
.update(portion
)
211 err("Browsing %s timeline, new tweets: %i"
212 % (screen_name
if screen_name
else "home", new
))
215 max_id
= min(portion
.keys()) # browse backwards
def rate_limit_status(twitter):
    """Print the current Twitter API rate limit status.

    Queries twitter.account.rate_limit_status() once and prints two lines:
    the remaining requests out of the hourly limit, and the time left until
    the next reset together with the reset time in local time.
    """
    status = twitter.account.rate_limit_status()

    quota = (status['remaining_hits'], status['hourly_limit'])
    print("Remaining API requests: %i/%i (hourly limit)" % quota)

    # The reset time is read only after the first line has been printed,
    # so a response lacking it still reports the remaining quota.
    seconds_left = int(status['reset_time_in_seconds'] - time.time())
    reset_local = time.asctime(
        time.localtime(status['reset_time_in_seconds']))
    print("Next reset in %is (%s)" % (seconds_left, reset_local))
227 def main(args
=sys
.argv
[1:]):
235 parse_args(args
, options
)
236 except GetoptError
as e
:
237 err("I can't do that, %s." % e
)
240 # exit if no user given
241 # except if asking for API rate or archive of timeline
242 if not options
['extra_args'] and not (options
['api-rate'] or
243 options
['timeline']):
247 # authenticate using OAuth, asking for token if necessary
249 oauth_filename
= (os
.getenv("HOME", "") + os
.sep
250 + ".twitter-archiver_oauth")
251 if not os
.path
.exists(oauth_filename
):
252 oauth_dance("Twitter-Archiver", CONSUMER_KEY
, CONSUMER_SECRET
,
254 oauth_token
, oauth_token_secret
= read_token_file(oauth_filename
)
255 auth
= OAuth(oauth_token
, oauth_token_secret
, CONSUMER_KEY
,
260 twitter
= Twitter(auth
=auth
, api_version
='1', domain
='api.twitter.com')
262 if options
['api-rate']:
263 rate_limit_status(twitter
)
266 # save own timeline (the user used in OAuth)
267 if options
['timeline']:
268 if isinstance(auth
, NoAuth
):
269 err("You must be authenticated to save timeline.")
272 filename
= options
['save-dir'] + os
.sep
+ options
['timeline']
273 print("* Archiving own timeline in %s" % filename
)
277 tweets
= load_tweets(filename
)
279 err("Error when loading saved tweets: %s - continuing without"
283 # no screen_name means we want home_timeline, not user_timeline
284 timeline(twitter
, "", tweets
)
285 except KeyboardInterrupt:
290 save_tweets(filename
, tweets
)
291 print("Total tweets in own timeline: %i" % len(tweets
))
293 # read users from command-line or stdin
294 users
= options
['extra_args']
295 if len(users
) == 1 and users
[0] == "-":
296 users
= [line
.strip() for line
in sys
.stdin
.readlines()]
298 # save tweets for every user
299 total
, total_new
= 0, 0
301 filename
= options
['save-dir'] + os
.sep
+ user
302 print("* Archiving %s tweets in %s" % (user
, filename
))
306 tweets
= load_tweets(filename
)
308 err("Error when loading saved tweets: %s - continuing without"
314 timeline(twitter
, user
, tweets
)
315 except KeyboardInterrupt:
320 save_tweets(filename
, tweets
)
322 new
= len(tweets
) - before
324 print("Total tweets for %s: %i (%i new)" % (user
, len(tweets
), new
))
326 print("Total: %i tweets (%i new) for %i users"
327 % (total
, total_new
, len(users
)))