]>
jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
2 twitter-archiver [options] <-|user> [<user> ...]
5 Archive tweets of users, sorted by date from oldest to newest, in
6 the following format: <id> <date> <<screen_name>> <tweet_text>
7 Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
8 resume archiving on next run. Archive file name is the user name.
9 Provide "-" instead of users to read users from standard input.
12 -o --oauth authenticate to Twitter using OAuth (default: no)
13 -s --save-dir <path> directory to save archives (default: current dir)
14 -a --api-rate see current API rate limit status
15 -t --timeline <file> archive own timeline into given file name (requires
16 OAuth, max 800 statuses)
17 -m --mentions <file> archive own mentions instead of timeline into
18 given file name (requires OAuth, max 800 statuses)
19 -v --favorites archive user's favorites instead of timeline
20 -f --follow-redirects follow redirects of urls
21 -r --redirect-sites follow redirects for this comma separated list of hosts
24 Authenticate to Twitter using OAuth to archive tweets of private profiles
25 and have higher API rate limits. OAuth authentication tokens are stored
26 in ~/.twitter-archiver_oauth.
29 from __future__
import print_function
31 import os
, sys
, time
, calendar
, functools
32 from getopt
import gnu_getopt
as getopt
, GetoptError
35 import urllib
.request
as urllib2
36 import http
.client
as httplib
42 # T-Archiver (Twitter-Archiver) application registered by @stalkr_
43 CONSUMER_KEY
='d8hIyfzs7ievqeeZLjZrqQ'
44 CONSUMER_SECRET
='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
46 from .api
import Twitter
, TwitterError
47 from .oauth
import OAuth
, read_token_file
48 from .oauth_dance
import oauth_dance
49 from .auth
import NoAuth
50 from .util
import Fail
, err
, expand_line
, parse_host_list
51 from .follow
import lookup
53 def parse_args(args
, options
):
54 """Parse arguments from command-line to set options."""
55 long_opts
= ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites="]
56 short_opts
= "hos:at:m:vfr:"
57 opts
, extra_args
= getopt(args
, short_opts
, long_opts
)
60 if opt
in ('-h', '--help'):
63 elif opt
in ('-o', '--oauth'):
64 options
['oauth'] = True
65 elif opt
in ('-s', '--save-dir'):
66 options
['save-dir'] = arg
67 elif opt
in ('-a', '--api-rate'):
68 options
['api-rate' ] = True
69 elif opt
in ('-t', '--timeline'):
70 options
['timeline'] = arg
71 elif opt
in ('-m', '--mentions'):
72 options
['mentions'] = arg
73 elif opt
in ('-v', '--favorites'):
74 options
['favorites'] = True
75 elif opt
in ('-f', '--follow-redirects'):
76 options
['follow-redirects'] = True
77 elif opt
in ('-r', '--redirect-sites'):
78 options
['redirect-sites'] = arg
80 options
['extra_args'] = extra_args
82 def load_tweets(filename
):
83 """Load tweets from file into dict, see save_tweets()."""
85 archive
= open(filename
,"r")
86 except IOError: # no archive (yet)
90 for line
in archive
.readlines():
91 tid
, text
= line
.strip().split(" ", 1)
92 tweets
[int(tid
)] = text
.decode("utf-8")
97 def save_tweets(filename
, tweets
):
98 """Save tweets from dict to file.
100 Save tweets from dict to UTF-8 encoded file, one per line:
101 <tweet id (number)> <tweet text>
103 <date> <<user>> [RT @<user>: ]<text>
106 filename: A string representing the file name to save tweets to.
107 tweets: A dict mapping tweet-ids (int) to tweet text (str).
113 archive
= open(filename
,"w")
115 err("Cannot save tweets: %s" % str(e
))
118 for k
in sorted(tweets
.keys()):
119 archive
.write("%i %s\n" % (k
, tweets
[k
].encode('utf-8')))
def format_date(utc, to_localtime=True):
    """Convert a Twitter UTC date string into the archive date format.

    Args:
        utc: Date string shaped like "Mon Jan 02 03:04:05 +0000 2012".
        to_localtime: When true (the default) and the local zone's standard
            offset differs from UTC, the date is rendered in local time;
            otherwise it is rendered in UTC.

    Returns:
        A string "YYYY-MM-DD HH:MM:SS TZ" where TZ is either "UTC" or the
        local zone abbreviation in effect at that instant.

    Raises:
        ValueError: If ``utc`` does not match the expected layout.
    """
    # %Z matches zone names rather than numeric offsets, so the literal
    # "+0000" is spelled out as "UTC" before parsing.
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # Use the standard or DST abbreviation that applies to this very
        # date; always taking tzname[1] mislabelled non-DST dates.
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
def expand_format_text(hosts, text):
    """Follow redirects in links, then squeeze the text onto one line.

    The text is first passed through expand_line() together with the
    given hosts, and the result is escaped by direct_format_text().
    """
    expanded = expand_line(text, hosts)
    return direct_format_text(expanded)
def direct_format_text(text):
    """Escape line feeds and carriage returns so text stays on one line."""
    flattened = text
    # Same order as before: line feeds first, then carriage returns.
    for raw, escaped in (('\n', '\\n'), ('\r', '\\r')):
        flattened = flattened.replace(raw, escaped)
    return flattened
140 def statuses_resolve_uids(twitter
, tl
):
141 """Resolve user ids to screen names from statuses."""
142 # get all user ids that need a lookup (no screen_name key)
145 rt
= t
.get('retweeted_status')
146 if rt
and not rt
['user'].get('screen_name'):
147 user_ids
.append(rt
['user']['id'])
148 if not t
['user'].get('screen_name'):
149 user_ids
.append(t
['user']['id'])
151 # resolve all of them at once
152 names
= lookup(twitter
, list(set(user_ids
)))
154 # build new statuses with resolved uids
157 rt
= t
.get('retweeted_status')
158 if rt
and not rt
['user'].get('screen_name'):
159 name
= names
[rt
['user']['id']]
160 t
['retweeted_status']['user']['screen_name'] = name
161 if not t
['user'].get('screen_name'):
162 name
= names
[t
['user']['id']]
163 t
['user']['screen_name'] = name
168 def statuses_portion(twitter
, screen_name
, max_id
=None, mentions
=False, favorites
=False):
169 """Get a portion of the statuses of a screen name."""
170 kwargs
= dict(count
=200, include_rts
=1, screen_name
=screen_name
)
172 kwargs
['max_id'] = max_id
176 tl
= twitter
.statuses
.mentions(**kwargs
)
178 tl
= twitter
.favorites(**kwargs
) # API v1, favorites.list() in v1.1
181 tl
= twitter
.statuses
.user_timeline(**kwargs
)
183 tl
= twitter
.statuses
.home_timeline(**kwargs
)
185 # some tweets do not provide screen name but user id, resolve those
186 for t
in statuses_resolve_uids(twitter
, tl
):
188 rt
= t
.get('retweeted_status')
190 text
= "RT @%s: %s" % (rt
['user']['screen_name'], rt
['text'])
191 tweets
[t
['id']] = "%s <%s> %s" % (format_date(t
['created_at']),
192 t
['user']['screen_name'],
196 def statuses(twitter
, screen_name
, tweets
, mentions
=False, favorites
=False):
197 """Get all the statuses for a screen name."""
200 # get portions of statuses, lowering max id until no new tweets appear
203 portion
= statuses_portion(twitter
, screen_name
, max_id
, mentions
, favorites
)
204 except TwitterError
as e
:
206 err("Fail: %i Unauthorized (tweets of that user are protected)"
209 elif e
.e
.code
== 400:
210 err("Fail: %i API rate limit exceeded" % e
.e
.code
)
211 rate
= twitter
.account
.rate_limit_status()
212 reset
= rate
['reset_time_in_seconds']
213 reset
= time
.asctime(time
.localtime(reset
))
214 delay
= int(rate
['reset_time_in_seconds']
215 - time
.time()) + 5 # avoid race
216 err("Hourly limit of %i requests reached, next reset on %s: "
217 "going to sleep for %i secs" % (rate
['hourly_limit'],
221 elif e
.e
.code
== 404:
222 err("Fail: %i This profile does not exist" % e
.e
.code
)
224 elif e
.e
.code
== 502:
225 err("Fail: %i Service currently unavailable, retrying..."
228 err("Fail: %s\nRetrying..." % str(e
)[:500])
230 except urllib2
.URLError
as e
:
231 err("Fail: urllib2.URLError %s - Retrying..." % str(e
))
233 except httplib
.error
as e
:
234 err("Fail: httplib.error %s - Retrying..." % str(e
))
236 except KeyError as e
:
237 err("Fail: KeyError %s - Retrying..." % str(e
))
241 tweets
.update(portion
)
243 err("Browsing %s statuses, new tweets: %i"
244 % (screen_name
if screen_name
else "home", new
))
247 max_id
= min(portion
.keys())-1 # browse backwards
def rate_limit_status(twitter):
    """Print the current Twitter API rate limit status.

    Queries twitter.account.rate_limit_status() once and prints two lines:
    the remaining/hourly request counts, then the time left until the next
    reset together with the local wall-clock time of that reset.
    """
    status = twitter.account.rate_limit_status()

    quota_line = "Remaining API requests: %i/%i (hourly limit)" % (
        status['remaining_hits'], status['hourly_limit'])
    print(quota_line)

    reset_epoch = status['reset_time_in_seconds']
    seconds_left = int(reset_epoch - time.time())
    reset_clock = time.asctime(time.localtime(reset_epoch))
    print("Next reset in %is (%s)" % (seconds_left, reset_clock))
259 def main(args
=sys
.argv
[1:]):
267 'follow-redirects': False,
268 'redirect-sites': None,
271 parse_args(args
, options
)
272 except GetoptError
as e
:
273 err("I can't do that, %s." % e
)
276 # exit if no user given
277 # except if asking for API rate, or archive of timeline or mentions
278 if not options
['extra_args'] and not (options
['api-rate'] or
279 options
['timeline'] or
280 options
['mentions']):
284 # authenticate using OAuth, asking for token if necessary
286 oauth_filename
= (os
.getenv("HOME", "") + os
.sep
287 + ".twitter-archiver_oauth")
288 if not os
.path
.exists(oauth_filename
):
289 oauth_dance("Twitter-Archiver", CONSUMER_KEY
, CONSUMER_SECRET
,
291 oauth_token
, oauth_token_secret
= read_token_file(oauth_filename
)
292 auth
= OAuth(oauth_token
, oauth_token_secret
, CONSUMER_KEY
,
297 twitter
= Twitter(auth
=auth
, api_version
='1', domain
='api.twitter.com')
299 if options
['api-rate']:
300 rate_limit_status(twitter
)
304 if options
['follow-redirects'] or options
['redirect-sites'] :
305 if options
['redirect-sites']:
306 hosts
= parse_host_list(options
['redirect-sites'])
309 format_text
= functools
.partial(expand_format_text
, hosts
)
311 format_text
= direct_format_text
313 # save own timeline or mentions (the user used in OAuth)
314 if options
['timeline'] or options
['mentions']:
315 if isinstance(auth
, NoAuth
):
316 err("You must be authenticated to save timeline or mentions.")
319 if options
['timeline']:
320 filename
= options
['save-dir'] + os
.sep
+ options
['timeline']
321 print("* Archiving own timeline in %s" % filename
)
322 elif options
['mentions']:
323 filename
= options
['save-dir'] + os
.sep
+ options
['mentions']
324 print("* Archiving own mentions in %s" % filename
)
328 tweets
= load_tweets(filename
)
329 except Exception as e
:
330 err("Error when loading saved tweets: %s - continuing without"
334 statuses(twitter
, "", tweets
, options
['mentions'], options
['favorites'])
335 except KeyboardInterrupt:
340 save_tweets(filename
, tweets
)
341 if options
['timeline']:
342 print("Total tweets in own timeline: %i" % len(tweets
))
343 elif options
['mentions']:
344 print("Total mentions: %i" % len(tweets
))
346 # read users from command-line or stdin
347 users
= options
['extra_args']
348 if len(users
) == 1 and users
[0] == "-":
349 users
= [line
.strip() for line
in sys
.stdin
.readlines()]
351 # save tweets for every user
352 total
, total_new
= 0, 0
354 filename
= options
['save-dir'] + os
.sep
+ user
355 if options
['favorites']:
356 filename
= filename
+ "-favorites"
357 print("* Archiving %s tweets in %s" % (user
, filename
))
361 tweets
= load_tweets(filename
)
362 except Exception as e
:
363 err("Error when loading saved tweets: %s - continuing without"
369 statuses(twitter
, user
, tweets
, options
['mentions'], options
['favorites'])
370 except KeyboardInterrupt:
375 save_tweets(filename
, tweets
)
377 new
= len(tweets
) - before
379 print("Total tweets for %s: %i (%i new)" % (user
, len(tweets
), new
))
381 print("Total: %i tweets (%i new) for %i users"
382 % (total
, total_new
, len(users
)))