]>
jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
"""USAGE
    twitter-archiver [options] <-|user> [<user> ...]

DESCRIPTION
    Archive tweets of users, sorted by date from oldest to newest, in
    the following format: <id> <date> <<screen_name>> <tweet_text>
    Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
    resume archiving on next run. Archive file name is the user name.
    Provide "-" instead of users to read users from standard input.

OPTIONS
 -o --oauth            authenticate to Twitter using OAuth (default: no)
 -s --save-dir <path>  directory to save archives (default: current dir)
 -a --api-rate         see current API rate limit status
 -t --timeline <file>  archive own timeline into given file name (requires
                       OAuth, max 800 statuses)
 -m --mentions <file>  archive own mentions instead of timeline into
                       given file name (requires OAuth, max 800 statuses)
 -v --favorites        archive user's favorites instead of timeline
 -f --follow-redirects follow redirects of urls
 -r --redirect-sites   follow redirects for this comma separated list of hosts
 -d --dms <file>       archive own direct messages (both received and
                       sent) into given file name.
 -i --isoformat        store dates in ISO format (specifically RFC 3339)

AUTHENTICATION
    Authenticate to Twitter using OAuth to archive tweets of private profiles
    and have higher API rate limits. OAuth authentication tokens are stored
    in ~/.twitter-archiver_oauth.
"""
32 from __future__
import print_function
34 import os
, sys
, time
as _time
, calendar
, functools
35 from datetime
import time
, date
, datetime
36 from getopt
import gnu_getopt
as getopt
, GetoptError
39 import urllib
.request
as urllib2
40 import http
.client
as httplib
46 # T-Archiver (Twitter-Archiver) application registered by @stalkr_
47 CONSUMER_KEY
='d8hIyfzs7ievqeeZLjZrqQ'
48 CONSUMER_SECRET
='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
50 from .api
import Twitter
, TwitterError
51 from .oauth
import OAuth
, read_token_file
52 from .oauth_dance
import oauth_dance
53 from .auth
import NoAuth
54 from .util
import Fail
, err
, expand_line
, parse_host_list
55 from .follow
import lookup
56 from .timezones
import utc
as UTC
, Local
def parse_args(args, options):
    """Parse arguments from command-line to set options.

    Args:
        args: List of command-line argument strings (program name excluded).
        options: Dict that is updated in place. Only options actually
            present in ``args`` are written; ``options['extra_args']`` is
            always set to the list of remaining positional arguments.

    Raises:
        GetoptError: Propagated from getopt for an unknown option or a
            missing option argument.
        SystemExit: After handling ``-h`` / ``--help``.
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 'redirect-sites=', 'dms=', 'isoformat']
    short_opts = "hos:at:m:vfr:d:i"
    # gnu_getopt semantics: options and positional arguments may be mixed.
    opts, extra_args = getopt(args, short_opts, long_opts)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            # NOTE(review): the body of this branch was missing from the
            # listing this was restored from; printing the module usage text
            # and exiting successfully is assumed -- confirm against VCS.
            print(__doc__)
            raise SystemExit(0)
        elif opt in ('-o', '--oauth'):
            options['oauth'] = True
        elif opt in ('-s', '--save-dir'):
            options['save-dir'] = arg
        elif opt in ('-a', '--api-rate'):
            options['api-rate'] = True
        elif opt in ('-t', '--timeline'):
            options['timeline'] = arg
        elif opt in ('-m', '--mentions'):
            options['mentions'] = arg
        elif opt in ('-v', '--favorites'):
            options['favorites'] = True
        elif opt in ('-f', '--follow-redirects'):
            options['follow-redirects'] = True
        elif opt in ('-r', '--redirect-sites'):
            options['redirect-sites'] = arg
        elif opt in ('-d', '--dms'):
            options['dms'] = arg
        elif opt in ('-i', '--isoformat'):
            options['isoformat'] = True

    options['extra_args'] = extra_args
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    The archive is read as UTF-8 text, one tweet per line in the form
    ``<tweet id> <tweet text>``.

    Args:
        filename: Path of the archive file.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). An empty
        dict is returned when the archive cannot be opened (no archive yet).

    Raises:
        ValueError: If a non-blank line does not start with an integer id
            followed by a space and the tweet text.
    """
    # Local import keeps this block self-contained. io.open decodes while
    # reading on both Python 2 and 3, so no per-line .decode() is needed
    # (str has no .decode() on Python 3).
    import io

    try:
        archive = io.open(filename, "r", encoding="utf-8")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:  # closed even if a malformed line raises
        for line in archive:
            line = line.strip()
            if not line:
                # A stray blank line should not discard the whole archive.
                continue
            tid, text = line.split(" ", 1)
            tweets[int(tid)] = text
    return tweets
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line, sorted by
    tweet id:
        <tweet id (number)> <tweet text>
    where the tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).

    If the file cannot be opened for writing, the error is reported through
    err() and nothing is written.
    """
    # Local import keeps this block self-contained. io.open encodes while
    # writing on both Python 2 and 3; formatting already-encoded bytes with
    # "%s" would write a b'...' repr into the archive on Python 3.
    import io

    if not tweets:
        # Nothing to write: do not create or truncate an archive.
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:  # closed even if a write fails
        for k in sorted(tweets):
            archive.write(u"%i %s\n" % (k, tweets[k]))
def format_date(utc, isoformat=False):
    """Parse Twitter's UTC date and format it in local time.

    Args:
        utc: Date string as sent by Twitter; it is parsed with the format
            '%a %b %d %H:%M:%S %Z %Y' after '+0000' is rewritten to 'UTC'.
        isoformat: When true, return ``datetime.isoformat()`` output;
            otherwise return 'YYYY-MM-DD HH:MM:SS TZ'.

    Returns:
        The date converted to the ``Local`` timezone, as a string.
    """
    # strptime cannot parse the numeric offset with %Z, so rewrite it to the
    # zone name first. The result is a naive datetime.
    naive = datetime.strptime(utc.replace('+0000', 'UTC'),
                              '%a %b %d %H:%M:%S %Z %Y')
    # Attach the UTC timezone to get a non-naive datetime.
    aware = naive.replace(tzinfo=UTC)

    # Convert to localtime
    local = aware.astimezone(Local)

    if isoformat:
        return local.isoformat()
    return local.strftime('%Y-%m-%d %H:%M:%S %Z')
def expand_format_text(hosts, text):
    """Follow redirects in links, then flatten the text to one line.

    Args:
        hosts: Passed through to expand_line() as its second argument.
        text: The tweet text.

    Returns:
        The result of direct_format_text() applied to the expanded text.
    """
    return direct_format_text(expand_line(text, hosts))
def direct_format_text(text):
    """Transform special chars in text to have only one line.

    Line feeds and carriage returns are replaced by the two-character
    sequences ``\\n`` and ``\\r`` so that each tweet occupies exactly one
    line of the archive.
    """
    return text.replace('\n', '\\n').replace('\r', '\\r')
def statuses_resolve_uids(twitter, tl):
    """Resolve user ids to screen names from statuses.

    Args:
        twitter: API client, passed through to lookup().
        tl: Iterable of status dicts. Each has a 'user' dict and may have a
            'retweeted_status' dict with its own 'user' dict.

    Returns:
        A new list holding the same status dicts, in order. Statuses whose
        user (or retweeted user) had no 'screen_name' are updated in place
        with the name returned by lookup().
    """
    tl = list(tl)  # iterated twice below

    # get all user ids that needs a lookup (no screen_name key)
    user_ids = set()
    for t in tl:
        rt = t.get('retweeted_status')
        if rt and not rt['user'].get('screen_name'):
            user_ids.add(rt['user']['id'])
        if not t['user'].get('screen_name'):
            user_ids.add(t['user']['id'])

    # resolve all of them at once; skip the call when nothing is missing
    names = lookup(twitter, list(user_ids)) if user_ids else {}

    # build new statuses with resolved uids
    new_tl = []
    for t in tl:
        rt = t.get('retweeted_status')
        if rt and not rt['user'].get('screen_name'):
            rt['user']['screen_name'] = names[rt['user']['id']]
        if not t['user'].get('screen_name'):
            t['user']['screen_name'] = names[t['user']['id']]
        new_tl.append(t)

    return new_tl
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    Args:
        twitter: API client.
        screen_name: User whose statuses are fetched; when empty, the home
            timeline is fetched instead of a user timeline.
        max_id: If set, passed to the API as 'max_id'.
        mentions: Fetch the mentions timeline.
        favorites: Fetch the favorites list (used when mentions is false).
        received_dms: None for regular statuses; otherwise direct messages
            are fetched -- received ones when true, sent ones when false.
        isoformat: Passed to format_date().

    Returns:
        A dict mapping status id to the formatted archive line
        '<date> <<sender>> [@<recipient> ]<text>'.
    """
    # NOTE(review): the branch keywords, the `tweets`/`text` initialisers and
    # the final format argument were restored from a damaged listing of this
    # function -- confirm against version control.
    is_dm = received_dms is not None

    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name:
        tl = twitter.statuses.user_timeline(**kwargs)
    else:  # own timeline
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    newtl = tl if is_dm else statuses_resolve_uids(twitter, tl)

    tweets = {}
    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        date = format_date(t['created_at'], isoformat=isoformat)
        # DMs don't include mentions by default, so in order to show who
        # the recipient was, we synthesise a mention. If we're not
        # operating on DMs, behave as normal
        if is_dm:
            tweets[t['id']] = "%s <%s> @%s %s" % (
                date,
                t['sender_screen_name'],
                t['recipient']['screen_name'],
                format_text(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (
                date,
                t['user']['screen_name'],
                format_text(text))
    return tweets
232 def statuses(twitter
, screen_name
, tweets
, mentions
=False, favorites
=False, received_dms
=None, isoformat
=False):
233 """Get all the statuses for a screen name."""
236 # get portions of statuses, incrementing max id until no new tweets appear
239 portion
= statuses_portion(twitter
, screen_name
, max_id
, mentions
, favorites
, received_dms
, isoformat
)
240 except TwitterError
as e
:
242 err("Fail: %i Unauthorized (tweets of that user are protected)"
245 elif e
.e
.code
== 400:
246 err("Fail: %i API rate limit exceeded" % e
.e
.code
)
247 rate
= twitter
.account
.rate_limit_status()
248 reset
= rate
['reset_time_in_seconds']
249 reset
= time
.asctime(time
.localtime(reset
))
250 delay
= int(rate
['reset_time_in_seconds']
251 - time
.time()) + 5 # avoid race
252 err("Hourly limit of %i requests reached, next reset on %s: "
253 "going to sleep for %i secs" % (rate
['hourly_limit'],
257 elif e
.e
.code
== 404:
258 err("Fail: %i This profile does not exist" % e
.e
.code
)
260 elif e
.e
.code
== 502:
261 err("Fail: %i Service currently unavailable, retrying..."
264 err("Fail: %s\nRetrying..." % str(e
)[:500])
266 except urllib2
.URLError
as e
:
267 err("Fail: urllib2.URLError %s - Retrying..." % str(e
))
269 except httplib
.error
as e
:
270 err("Fail: httplib.error %s - Retrying..." % str(e
))
272 except KeyError as e
:
273 err("Fail: KeyError %s - Retrying..." % str(e
))
277 tweets
.update(portion
)
279 err("Browsing %s statuses, new tweets: %i"
280 % (screen_name
if screen_name
else "home", new
))
283 max_id
= min(portion
.keys())-1 # browse backwards
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Args:
        twitter: API client; ``twitter.account.rate_limit_status()`` must
            return a mapping with 'remaining_hits', 'hourly_limit' and
            'reset_time_in_seconds'.
    """
    r = twitter.account.rate_limit_status()
    reset_at = r['reset_time_in_seconds']
    print("Remaining API requests: %i/%i (hourly limit)"
          % (r['remaining_hits'], r['hourly_limit']))
    # Use the `_time` module alias: the bare name `time` is rebound to
    # datetime.time by this file's imports and has no time()/asctime().
    print("Next reset in %is (%s)"
          % (int(reset_at - _time.time()),
             _time.asctime(_time.localtime(reset_at))))
295 def main(args
=sys
.argv
[1:]):
304 'follow-redirects': False,
305 'redirect-sites': None,
309 parse_args(args
, options
)
310 except GetoptError
as e
:
311 err("I can't do that, %s." % e
)
314 # exit if no user given
315 # except if asking for API rate, or archive of timeline or mentions
316 if not options
['extra_args'] and not (options
['api-rate'] or
317 options
['timeline'] or
318 options
['mentions'] or
323 # authenticate using OAuth, asking for token if necessary
325 oauth_filename
= (os
.getenv("HOME", "") + os
.sep
326 + ".twitter-archiver_oauth")
327 if not os
.path
.exists(oauth_filename
):
328 oauth_dance("Twitter-Archiver", CONSUMER_KEY
, CONSUMER_SECRET
,
330 oauth_token
, oauth_token_secret
= read_token_file(oauth_filename
)
331 auth
= OAuth(oauth_token
, oauth_token_secret
, CONSUMER_KEY
,
336 twitter
= Twitter(auth
=auth
, api_version
='1.1', domain
='api.twitter.com')
338 if options
['api-rate']:
339 rate_limit_status(twitter
)
343 if options
['follow-redirects'] or options
['redirect-sites'] :
344 if options
['redirect-sites']:
345 hosts
= parse_host_list(options
['redirect-sites'])
348 format_text
= functools
.partial(expand_format_text
, hosts
)
350 format_text
= direct_format_text
352 # save own timeline or mentions (the user used in OAuth)
353 if options
['timeline'] or options
['mentions']:
354 if isinstance(auth
, NoAuth
):
355 err("You must be authenticated to save timeline or mentions.")
358 if options
['timeline']:
359 filename
= options
['save-dir'] + os
.sep
+ options
['timeline']
360 print("* Archiving own timeline in %s" % filename
)
361 elif options
['mentions']:
362 filename
= options
['save-dir'] + os
.sep
+ options
['mentions']
363 print("* Archiving own mentions in %s" % filename
)
367 tweets
= load_tweets(filename
)
368 except Exception as e
:
369 err("Error when loading saved tweets: %s - continuing without"
373 statuses(twitter
, "", tweets
, options
['mentions'], options
['favorites'], isoformat
=options
['isoformat'])
374 except KeyboardInterrupt:
379 save_tweets(filename
, tweets
)
380 if options
['timeline']:
381 print("Total tweets in own timeline: %i" % len(tweets
))
382 elif options
['mentions']:
383 print("Total mentions: %i" % len(tweets
))
386 if isinstance(auth
, NoAuth
):
387 err("You must be authenticated to save DMs.")
390 filename
= options
['save-dir'] + os
.sep
+ options
['dms']
391 print("* Archiving own DMs in %s" % filename
)
395 dms
= load_tweets(filename
)
396 except Exception as e
:
397 err("Error when loading saved DMs: %s - continuing without"
401 statuses(twitter
, "", dms
, received_dms
=True, isoformat
=options
['isoformat'])
402 statuses(twitter
, "", dms
, received_dms
=False, isoformat
=options
['isoformat'])
403 except KeyboardInterrupt:
408 save_tweets(filename
, dms
)
409 print("Total DMs sent and received: %i" % len(dms
))
412 # read users from command-line or stdin
413 users
= options
['extra_args']
414 if len(users
) == 1 and users
[0] == "-":
415 users
= [line
.strip() for line
in sys
.stdin
.readlines()]
417 # save tweets for every user
418 total
, total_new
= 0, 0
420 filename
= options
['save-dir'] + os
.sep
+ user
421 if options
['favorites']:
422 filename
= filename
+ "-favorites"
423 print("* Archiving %s tweets in %s" % (user
, filename
))
427 tweets
= load_tweets(filename
)
428 except Exception as e
:
429 err("Error when loading saved tweets: %s - continuing without"
435 statuses(twitter
, user
, tweets
, options
['mentions'], options
['favorites'], isoformat
=options
['isoformat'])
436 except KeyboardInterrupt:
441 save_tweets(filename
, tweets
)
443 new
= len(tweets
) - before
445 print("Total tweets for %s: %i (%i new)" % (user
, len(tweets
), new
))
447 print("Total: %i tweets (%i new) for %i users"
448 % (total
, total_new
, len(users
)))