]>
jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
"""USAGE
    twitter-archiver [options] <-|user> [<user> ...]

DESCRIPTION
    Archive tweets of users, sorted by date from oldest to newest, in
    the following format: <id> <date> <<screen_name>> <tweet_text>
    Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
    resume archiving on next run. Archive file name is the user name.
    Provide "-" instead of users to read users from standard input.

OPTIONS
 -o --oauth            authenticate to Twitter using OAuth (default: no)
 -s --save-dir <path>  directory to save archives (default: current dir)
 -a --api-rate         see current API rate limit status
 -t --timeline <file>  archive own timeline into given file name (requires
                       OAuth, max 800 statuses)
 -m --mentions <file>  archive own mentions instead of timeline into
                       given file name (requires OAuth, max 800 statuses)
 -v --favorites        archive user's favorites instead of timeline
 -f --follow-redirects follow redirects of urls
 -r --redirect-sites   follow redirects for this comma separated list of hosts
 -d --dms <file>       archive own direct messages (both received and
                       sent) into given file name.

AUTHENTICATION
    Authenticate to Twitter using OAuth to archive tweets of private profiles
    and have higher API rate limits. OAuth authentication tokens are stored
    in ~/.twitter-archiver_oauth.
"""
31 from __future__
import print_function
33 import os
, sys
, time
, calendar
, functools
34 from getopt
import gnu_getopt
as getopt
, GetoptError
37 import urllib
.request
as urllib2
38 import http
.client
as httplib
# T-Archiver (Twitter-Archiver) application registered by @stalkr_
# OAuth consumer credentials identifying this application to Twitter.
CONSUMER_KEY = 'd8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET = 'AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
48 from .api
import Twitter
, TwitterError
49 from .oauth
import OAuth
, read_token_file
50 from .oauth_dance
import oauth_dance
51 from .auth
import NoAuth
52 from .util
import Fail
, err
, expand_line
, parse_host_list
53 from .follow
import lookup
def parse_args(args, options):
    """Parse arguments from command-line to set options.

    Args:
        args: A list of command-line argument strings (without the
            program name).
        options: A dict updated in place. Flag options are stored as True,
            options taking a value are stored as the given string, and the
            remaining positional arguments are stored under 'extra_args'.

    Raises:
        GetoptError: on an unknown option or a missing option value.
        SystemExit: after printing the usage text for -h/--help.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 'redirect-sites=', 'dms=']
    short_opts = "hos:at:m:vfr:d:"
    opts, extra_args = getopt(args, short_opts, long_opts)

    # (short spelling, long spelling, key in options, option takes a value)
    known = (
        ('-o', '--oauth', 'oauth', False),
        ('-s', '--save-dir', 'save-dir', True),
        ('-a', '--api-rate', 'api-rate', False),
        ('-t', '--timeline', 'timeline', True),
        ('-m', '--mentions', 'mentions', True),
        ('-v', '--favorites', 'favorites', False),
        ('-f', '--follow-redirects', 'follow-redirects', False),
        ('-r', '--redirect-sites', 'redirect-sites', True),
        ('-d', '--dms', 'dms', True),
    )

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            # NOTE(review): the body of this branch is absent from the
            # extracted source; printing the module usage text and exiting
            # successfully is the assumed behaviour - confirm.
            print(__doc__)
            raise SystemExit(0)
        for short, long_, key, takes_value in known:
            if opt in (short, long_):
                options[key] = arg if takes_value else True
                break

    options['extra_args'] = extra_args
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Args:
        filename: A string representing the archive file to read. Each
            non-empty line is expected to be "<tweet id> <tweet text>".

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). An
        empty dict is returned when the file cannot be opened, e.g. because
        no archive exists yet.

    Raises:
        ValueError: if a non-empty line has no id/text separator or its id
            is not an integer.
    """
    # Local import: io.open offers the same decoding text API on Python 2
    # and 3, whereas calling .decode() on a line only works on Python 2.
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    # The context manager closes the file even when a line fails to parse.
    with archive:
        for line in archive:
            line = line.strip()
            if not line:
                # a stray blank line should not invalidate the whole archive
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    ascending tweet id:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Nothing is written when `tweets` is empty. If the file cannot be opened
    for writing, the error is reported through err() and the function
    returns without raising.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import: io.open encodes on write on both Python 2 and 3.
    # Formatting pre-encoded bytes with "%s" would write "b'...'" on Python 3.
    import io

    # NOTE(review): this early return is absent from the extracted source and
    # was reconstructed; it avoids creating empty archive files.
    if not tweets:
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    # The context manager closes the file even if a write fails midway.
    with archive:
        for tid in sorted(tweets):
            archive.write(u"%i %s\n" % (tid, tweets[tid]))
def format_date(utc, to_localtime=True):
    """Parse Twitter's UTC date into UTC or local time.

    Args:
        utc: A date string such as "Wed Aug 27 13:08:45 +0000 2008".
        to_localtime: When true and the local zone has a non-zero standard
            offset from UTC, the date is converted to local time.

    Returns:
        A string "YYYY-MM-DD HH:MM:SS TZ" where TZ is the local zone name in
        effect on that date, or "UTC" when no conversion takes place.
    """
    # strptime's %Z does not accept a numeric offset, so spell the zone out.
    u = time.strptime(utc.replace('+0000', 'UTC'), '%a %b %d %H:%M:%S %Z %Y')
    if to_localtime and time.timezone != 0:
        t = time.localtime(calendar.timegm(u))
        # time.tzname is (standard name, DST name): pick the one in effect
        # for this particular date rather than always the DST name.
        zone = time.tzname[1 if t.tm_isdst > 0 else 0]
        return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + zone
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
def expand_format_text(hosts, text):
    """Following redirects in links.

    Passes `text` and `hosts` to expand_line() and then flattens the result
    to a single line with direct_format_text().
    """
    return direct_format_text(expand_line(text, hosts))
def direct_format_text(text):
    """Transform special chars in text to have only one line.

    Newlines and carriage returns are replaced by the two-character escape
    sequences "\\n" and "\\r" so that one tweet occupies one archive line.
    """
    return text.replace('\n', '\\n').replace('\r', '\\r')
def statuses_resolve_uids(twitter, tl):
    """Resolve user ids to screen names from statuses.

    Args:
        twitter: The API object handed to lookup().
        tl: An iterable of status dicts. Each has a 'user' dict and may
            have a 'retweeted_status' dict with its own 'user' dict.

    Returns:
        A list of the same status dicts. Any 'user' entry that had no
        'screen_name' is filled in (in place) with the name returned by
        lookup() for its 'id'.
    """
    def lacks_name(status):
        return not status['user'].get('screen_name')

    # get all user ids that needs a lookup (no screen_name key)
    user_ids = set()
    for t in tl:
        rt = t.get('retweeted_status')
        if rt and lacks_name(rt):
            user_ids.add(rt['user']['id'])
        if lacks_name(t):
            user_ids.add(t['user']['id'])

    # nothing to resolve: avoid calling lookup() at all
    if not user_ids:
        return list(tl)

    # resolve all of them at once
    names = lookup(twitter, list(user_ids))

    # build new statuses with resolved uids
    new_tl = []
    for t in tl:
        rt = t.get('retweeted_status')
        if rt and lacks_name(rt):
            rt['user']['screen_name'] = names[rt['user']['id']]
        if lacks_name(t):
            t['user']['screen_name'] = names[t['user']['id']]
        new_tl.append(t)

    return new_tl
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: The API object to query.
        screen_name: The user to fetch; an empty value selects the home
            timeline of the authenticated user.
        max_id: When set, passed to the API as 'max_id'.
        mentions: Fetch own mentions instead of a timeline.
        favorites: Fetch favorites instead of a timeline.
        received_dms: None for regular statuses; True to fetch received
            direct messages, False to fetch sent direct messages.

    Returns:
        A dict mapping status id to the formatted archive text
        "<date> <<screen_name>> <text>". Direct messages get a synthesised
        "@<recipient>" mention in front of the text.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    is_dm = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions(**kwargs)
    elif favorites:
        tl = twitter.favorites(**kwargs)  # API v1, favorites.list() in v1.1
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name:
        tl = twitter.statuses.user_timeline(**kwargs)
    else:  # self
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    newtl = tl if is_dm else statuses_resolve_uids(twitter, tl)

    # main() installs the module-level ``format_text`` hook; fall back to the
    # plain single-line formatter when this function is used without main().
    formatter = globals().get('format_text', direct_format_text)

    tweets = {}
    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        # DMs don't include mentions by default, so in order to show who
        # the recipient was, we synthesise a mention. If we're not
        # operating on DMs, behave as normal
        if is_dm:
            tweets[t['id']] = "%s <%s> @%s %s" % (
                format_date(t['created_at']),
                t['sender_screen_name'],
                t['recipient']['screen_name'],
                formatter(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (
                format_date(t['created_at']),
                t['user']['screen_name'],
                formatter(text))
    return tweets
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None):
    """Get all the statuses for a screen name.

    Repeatedly calls statuses_portion(), walking backwards through the
    timeline, and merges every portion into `tweets` (updated in place).
    The loop stops when a portion adds few new entries, or on an
    unauthorized (401) or unknown-profile (404) response. Rate-limit (400)
    responses sleep until the reported reset time; other errors are retried
    after a short wait.

    Args:
        twitter: The API object to query.
        screen_name: The user to archive, or an empty value for "home".
        tweets: A dict mapping status id to text; receives the new entries.
        mentions, favorites, received_dms: Forwarded to statuses_portion().
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, incrementing max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id,
                                       mentions, favorites, received_dms)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            elif code == 400:
                err("Fail: %i API rate limit exceeded" % code)
                rate = twitter.account.rate_limit_status()
                reset_epoch = rate['reset_time_in_seconds']
                reset = time.asctime(time.localtime(reset_epoch))
                delay = int(reset_epoch - time.time()) + 5  # avoid race
                err("Hourly limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rate['hourly_limit'],
                                                    reset, delay))
                fail.wait(delay)
                continue
            elif code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            elif code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            # NOTE(review): the retry waits below are absent from the
            # extracted source; a 3 second back-off is assumed - confirm.
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            before = len(tweets)
            tweets.update(portion)
            new = len(tweets) - before
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name if screen_name else "home", new))
            # NOTE(review): the stop condition is absent from the extracted
            # source; "fewer than 190 new entries out of the 200 requested"
            # is assumed - confirm.
            if new < 190:
                break
            max_id = min(portion.keys()) - 1  # browse backwards
            fail = Fail()  # a successful request resets the failure count
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Queries twitter.account.rate_limit_status() and prints the remaining
    and hourly request counts, then the number of seconds until the next
    reset together with the local time of that reset.
    """
    r = twitter.account.rate_limit_status()
    reset_epoch = r['reset_time_in_seconds']
    # A reset time already in the past is shown as 0s, not a negative delay.
    remaining_secs = max(0, int(reset_epoch - time.time()))
    print("Remaining API requests: %i/%i (hourly limit)"
          % (r['remaining_hits'], r['hourly_limit']))
    print("Next reset in %is (%s)"
          % (remaining_secs, time.asctime(time.localtime(reset_epoch))))
def _load_saved(filename, label):
    """Return the archive stored in `filename`, or {} if loading fails."""
    try:
        return load_tweets(filename)
    except Exception as e:
        err("Error when loading saved %s: %s - continuing without"
            % (label, str(e)))
        return {}


def _exit_interrupted():
    """Report a keyboard interrupt and terminate with exit status 1."""
    # NOTE(review): the body of the KeyboardInterrupt handlers is absent from
    # the extracted source; reporting and exiting is assumed - confirm.
    err()
    err("Interrupted")
    raise SystemExit(1)


def main(args=sys.argv[1:]):
    """Command-line entry point of the archiver.

    Parses `args`, authenticates (OAuth when requested), then depending on
    the options prints the API rate limit status, archives the own timeline
    or mentions, archives own direct messages, and finally archives every
    user named on the command line (or read from standard input when the
    only argument is "-").

    Raises:
        SystemExit: on invalid options, when an operation requiring
            authentication is requested without OAuth, or on Ctrl-C.
    """
    # statuses_portion() reads this module-level formatter hook.
    global format_text

    # NOTE(review): only the last two defaults are visible in the extracted
    # source; the others are inferred from how the options are used below.
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline or mentions
    has_own_task = (options['api-rate'] or options['timeline'] or
                    options['mentions'] or options['dms'])
    if not options['extra_args'] and not has_own_task:
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        # expanduser() still finds the home directory when $HOME is unset,
        # instead of silently pointing at the filesystem root.
        oauth_filename = os.path.join(os.path.expanduser("~"),
                                      ".twitter-archiver_oauth")
        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    if options['follow-redirects'] or options['redirect-sites']:
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    save_dir = options['save-dir']

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = save_dir + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = save_dir + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = _load_saved(filename, "tweets")
        try:
            statuses(twitter, "", tweets, options['mentions'],
                     options['favorites'])
        except KeyboardInterrupt:
            _exit_interrupted()

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = save_dir + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = _load_saved(filename, "DMs")
        try:
            statuses(twitter, "", dms, received_dms=True)
            statuses(twitter, "", dms, received_dms=False)
        except KeyboardInterrupt:
            _exit_interrupted()

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        # Skip blank lines: an empty user name would archive the home
        # timeline into a file named after the save directory.
        stripped = (line.strip() for line in sys.stdin)
        users = [name for name in stripped if name]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = save_dir + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_saved(filename, "tweets")
        before = len(tweets)
        try:
            statuses(twitter, user, tweets, options['mentions'],
                     options['favorites'])
        except KeyboardInterrupt:
            _exit_interrupted()

        save_tweets(filename, tweets)
        new = len(tweets) - before
        total += len(tweets)
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))