]>
jfr.im git - z_archive/twitter.git/blob - twitter/archiver.py
"""USAGE
    twitter-archiver [options] <-|user> [<user> ...]

DESCRIPTION
    Archive tweets of users, sorted by date from oldest to newest, in
    the following format: <id> <date> <<screen_name>> <tweet_text>
    Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to
    resume archiving on next run. Archive file name is the user name.
    Provide "-" instead of users to read users from standard input.

OPTIONS
 -o --oauth            authenticate to Twitter using OAuth (default: no)
 -s --save-dir <path>  directory to save archives (default: current dir)
 -a --api-rate         see current API rate limit status
 -t --timeline <file>  archive own timeline into given file name (requires
                       OAuth, max 800 statuses)
 -m --mentions <file>  archive own mentions instead of timeline into
                       given file name (requires OAuth, max 800 statuses)
 -v --favorites        archive user's favorites instead of timeline
 -f --follow-redirects follow redirects of urls
 -r --redirect-sites   follow redirects for this comma separated list of hosts
 -d --dms <file>       archive own direct messages (both received and
                       sent) into given file name.
 -i --isoformat        store dates in ISO format (specifically RFC 3339)

AUTHENTICATION
    Authenticate to Twitter using OAuth to archive tweets of private profiles
    and have higher API rate limits. OAuth authentication tokens are stored
    in ~/.twitter-archiver_oauth.
"""
32 from __future__
import print_function
34 import os
, sys
, time
as _time
, calendar
, functools
35 from datetime
import time
, date
, datetime
36 from getopt
import gnu_getopt
as getopt
, GetoptError
39 import urllib
.request
as urllib2
40 import http
.client
as httplib
# T-Archiver (Twitter-Archiver) application registered by @stalkr_
# These identify the application itself to the OAuth flow; per-user tokens
# are read from a separate token file in main().
CONSUMER_KEY = 'd8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET = 'AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
50 from .api
import Twitter
, TwitterError
51 from .oauth
import OAuth
, read_token_file
52 from .oauth_dance
import oauth_dance
53 from .auth
import NoAuth
54 from .util
import Fail
, err
, expand_line
, parse_host_list
55 from .follow
import lookup
56 from .timezones
import utc
as UTC
, Local
def parse_args(args, options):
    """Parse arguments from command-line to set options.

    Args:
        args: List of command-line argument strings (program name excluded).
        options: Dict updated in place. Switches are stored as ``True``,
            options that take an argument store that argument, and the
            remaining non-option arguments are stored under
            ``'extra_args'``.

    Raises:
        GetoptError: Propagated from getopt for an unknown option or a
            missing option argument.
        SystemExit: For ``-h``/``--help`` (see the note in the loop).
    """
    long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=',
                 'mentions=', 'favorites', 'follow-redirects',
                 "redirect-sites=", 'dms=', 'isoformat']
    short_opts = "hos:at:m:vfr:d:i"
    opts, extra_args = getopt(args, short_opts, long_opts)

    # (short form, long form, key in options, option takes a value)
    table = (
        ('-o', '--oauth', 'oauth', False),
        ('-s', '--save-dir', 'save-dir', True),
        ('-a', '--api-rate', 'api-rate', False),
        ('-t', '--timeline', 'timeline', True),
        ('-m', '--mentions', 'mentions', True),
        ('-v', '--favorites', 'favorites', False),
        ('-f', '--follow-redirects', 'follow-redirects', False),
        ('-r', '--redirect-sites', 'redirect-sites', True),
        ('-d', '--dms', 'dms', True),
        ('-i', '--isoformat', 'isoformat', False),
    )
    handlers = {}
    for short, long_, key, takes_value in table:
        handlers[short] = handlers[long_] = (key, takes_value)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            # NOTE(review): the body of the help branch is missing from the
            # scraped listing; printing the module usage text and exiting
            # successfully is the assumed behavior -- confirm upstream.
            print(__doc__)
            raise SystemExit(0)
        key, takes_value = handlers[opt]
        options[key] = arg if takes_value else True

    options['extra_args'] = extra_args
def load_tweets(filename):
    """Load tweets from file into dict, see save_tweets().

    Each line of the archive is ``<tweet id> <tweet text>``, UTF-8 encoded.
    A line that cannot be parsed is reported through err() and skipped, so
    one bad line does not discard the rest of the archive.

    Args:
        filename: Path of the archive file to read.

    Returns:
        A dict mapping tweet ids (int) to tweet text (unicode text). An
        empty dict when the file cannot be opened (no archive yet).
    """
    try:
        # Read bytes and decode line by line: this behaves the same on
        # Python 2 and 3 and keeps a decoding failure local to one line.
        archive = open(filename, "rb")
    except IOError:  # no archive (yet)
        return {}

    tweets = {}
    with archive:
        for raw in archive:
            try:
                tid, text = raw.strip().decode("utf-8").split(" ", 1)
                tweets[int(tid)] = text
            except Exception as e:
                shown = raw.decode("utf-8", "replace")
                err("loading tweet %s failed due to %s" % (shown, e))
    return tweets
def save_tweets(filename, tweets):
    """Save tweets from dict to file.

    Save tweets from dict to UTF-8 encoded file, one per line:
        <tweet id (number)> <tweet text>
    Tweet text is:
        <date> <<user>> [RT @<user>: ]<text>

    Lines are written in ascending tweet-id order. A tweet that cannot be
    written is reported through err() and skipped.

    Args:
        filename: A string representing the file name to save tweets to.
        tweets: A dict mapping tweet-ids (int) to tweet text (str).
    """
    # Local import keeps this block self-contained; io.open gives a text
    # stream with an explicit encoding on both Python 2 and Python 3.
    import io

    if not tweets:
        # Nothing to write. Opening with "w" would truncate an existing
        # archive, so leave the file untouched.
        return

    try:
        archive = io.open(filename, "w", encoding="utf-8")
    except IOError as e:
        err("Cannot save tweets: %s" % str(e))
        return

    with archive:
        for k in sorted(tweets):
            try:
                archive.write(u"%i %s\n" % (k, tweets[k]))
            except Exception as ex:
                err("archiving tweet %s failed due to %s" % (k, ex))
def format_date(utc, isoformat=False):
    """Parse Twitter's UTC date and format it in the ``Local`` timezone.

    Args:
        utc: Date string in the form ``'%a %b %d %H:%M:%S +0000 %Y'``;
            the literal ``+0000`` offset is rewritten to ``UTC`` so that
            strptime's ``%Z`` accepts it.
        isoformat: When true, return ``datetime.isoformat()`` output;
            otherwise return ``'YYYY-MM-DD HH:MM:SS TZ'``.

    Returns:
        The formatted date string, converted to the ``Local`` tzinfo.

    Raises:
        ValueError: If *utc* does not match the expected layout (raised by
            ``datetime.strptime``).
    """
    naive = datetime.strptime(utc.replace('+0000', 'UTC'),
                              '%a %b %d %H:%M:%S %Z %Y')
    # strptime yields a naive datetime; attach the UTC tzinfo directly
    # rather than rebuilding the value field by field.
    aware = naive.replace(tzinfo=UTC)

    # Convert to localtime
    local = aware.astimezone(Local)

    if isoformat:
        return local.isoformat()
    return local.strftime('%Y-%m-%d %H:%M:%S %Z')
def expand_format_text(hosts, text):
    """Follow redirects in links, then flatten the text to one line.

    Args:
        hosts: Passed through as the second argument of expand_line().
            NOTE(review): main() supplies either the parsed
            --redirect-sites list or None; its exact meaning is defined by
            expand_line -- confirm there.
        text: The tweet text to process.

    Returns:
        The result of direct_format_text() applied to the expanded text.
    """
    return direct_format_text(expand_line(text, hosts))
def direct_format_text(text):
    """Transform special chars in text to have only one line.

    Newline and carriage-return characters are replaced by the two-character
    sequences ``\\n`` and ``\\r`` so that each tweet occupies exactly one
    line of the archive.

    Args:
        text: The tweet text.

    Returns:
        The text with line breaks escaped.
    """
    return text.replace('\n', '\\n').replace('\r', '\\r')
def statuses_resolve_uids(twitter, tl):
    """Resolve user ids to screen names from statuses.

    Some statuses carry a user id but no ``screen_name``, either on the
    status itself or on its ``retweeted_status``. Those ids are resolved
    with one lookup() call and the names are written back into the status
    dicts.

    Args:
        twitter: API object handed to lookup().
        tl: Iterable of status dicts; each has a ``'user'`` dict and may
            have a ``'retweeted_status'`` dict with its own ``'user'``.

    Returns:
        A new list holding the same status dicts, which are updated in
        place.

    Raises:
        KeyError: If lookup() returns no name for a requested id.
    """
    # get all user dicts that need a lookup (no screen_name key)
    unnamed = []
    for t in tl:
        rt = t.get('retweeted_status')
        if rt and not rt['user'].get('screen_name'):
            unnamed.append(rt['user'])
        if not t['user'].get('screen_name'):
            unnamed.append(t['user'])

    if unnamed:
        # resolve all of them at once; skipped entirely when every status
        # already carries a screen name
        names = lookup(twitter, list({u['id'] for u in unnamed}))
        for user in unnamed:
            user['screen_name'] = names[user['id']]

    return list(tl)
def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get a portion of the statuses of a screen name.

    Exactly one API endpoint is queried, chosen in this order: mentions,
    favorites, direct messages (received or sent), user timeline (when
    *screen_name* is non-empty), home timeline.

    Args:
        twitter: API object exposing the endpoints used below.
        screen_name: Screen name to query; empty for the authenticated user.
        max_id: When set, passed to the API as ``max_id``.
        mentions: Query the mentions timeline.
        favorites: Query the favorites list.
        received_dms: ``None`` for regular statuses, true for received
            direct messages, false for sent direct messages.
        isoformat: Forwarded to format_date().

    Returns:
        A dict mapping status id to its archive line,
        ``<date> <<screen_name>> <text>`` (direct messages additionally
        carry a synthesised ``@recipient`` mention).

    NOTE(review): several lines of this function are missing from the
    scraped listing (branch headers, the initial ``text`` value, the last
    format argument and the return). They were rebuilt from the surrounding
    code; ``format_text`` is taken to be the module-level callable that
    main() assigns -- confirm against upstream.
    """
    kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
    if max_id:
        kwargs['max_id'] = max_id

    is_dm = received_dms is not None

    if mentions:
        tl = twitter.statuses.mentions_timeline(**kwargs)
    elif favorites:
        tl = twitter.favorites.list(**kwargs)
    elif is_dm:
        if received_dms:
            tl = twitter.direct_messages(**kwargs)
        else:  # sent DMs
            tl = twitter.direct_messages.sent(**kwargs)
    elif screen_name:
        tl = twitter.statuses.user_timeline(**kwargs)
    else:  # own timeline
        tl = twitter.statuses.home_timeline(**kwargs)

    # some tweets do not provide screen name but user id, resolve those
    # this isn't a valid operation for DMs, so special-case them
    if is_dm:
        newtl = tl
    else:
        newtl = statuses_resolve_uids(twitter, tl)

    tweets = {}
    for t in newtl:
        text = t['text']
        rt = t.get('retweeted_status')
        if rt:
            text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
        when = format_date(t['created_at'], isoformat=isoformat)
        # DMs don't include mentions by default, so in order to show who
        # the recipient was, we synthesise a mention. If we're not
        # operating on DMs, behave as normal
        if is_dm:
            tweets[t['id']] = "%s <%s> @%s %s" % (when,
                                                  t['sender_screen_name'],
                                                  t['recipient']['screen_name'],
                                                  format_text(text))
        else:
            tweets[t['id']] = "%s <%s> %s" % (when,
                                              t['user']['screen_name'],
                                              format_text(text))
    return tweets
def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
    """Get all the statuses for a screen name.

    Repeatedly fetches portions with statuses_portion(), walking backwards
    through ids, and merges them into *tweets*. API and network errors are
    reported through err(); some end the loop, the others are retried.

    Args:
        twitter: API object.
        screen_name: Screen name to archive; empty for the authenticated
            user.
        tweets: Dict of already-known tweets, updated in place.
        mentions, favorites, received_dms, isoformat: Forwarded to
            statuses_portion().

    NOTE(review): the scraped listing drops the loop header, every
    ``break``/``continue``, the retry waits and the stop condition. They
    are rebuilt here from the visible messages and comments; the ``Fail``
    usage, the 3 second retry delay and the ``new < 190`` stop threshold
    in particular must be confirmed against upstream.
    """
    max_id = None
    fail = Fail()
    # get portions of statuses, incrementing max id until no new tweets appear
    while True:
        try:
            portion = statuses_portion(twitter, screen_name, max_id,
                                       mentions, favorites, received_dms,
                                       isoformat)
        except TwitterError as e:
            code = e.e.code
            if code == 401:
                err("Fail: %i Unauthorized (tweets of that user are protected)"
                    % code)
                break
            elif code == 429:
                err("Fail: %i API rate limit exceeded" % code)
                rls = twitter.application.rate_limit_status()
                reset_at = rls.rate_limit_reset
                reset = _time.asctime(_time.localtime(reset_at))
                delay = int(reset_at - _time.time()) + 5  # avoid race
                err("Interval limit of %i requests reached, next reset on %s: "
                    "going to sleep for %i secs" % (rls.rate_limit_limit,
                                                    reset, delay))
                fail.wait(delay)
                continue
            elif code == 404:
                err("Fail: %i This profile does not exist" % code)
                break
            elif code == 502:
                err("Fail: %i Service currently unavailable, retrying..."
                    % code)
            else:
                err("Fail: %s\nRetrying..." % str(e)[:500])
            fail.wait(3)
        except urllib2.URLError as e:
            err("Fail: urllib2.URLError %s - Retrying..." % str(e))
            fail.wait(3)
        except httplib.error as e:
            err("Fail: httplib.error %s - Retrying..." % str(e))
            fail.wait(3)
        except KeyError as e:
            err("Fail: KeyError %s - Retrying..." % str(e))
            fail.wait(3)
        else:
            known = len(tweets)
            tweets.update(portion)
            new = len(tweets) - known
            err("Browsing %s statuses, new tweets: %i"
                % (screen_name if screen_name else "home", new))
            # NOTE(review): stop threshold reconstructed -- see docstring.
            if new < 190:
                break
            max_id = min(portion.keys())-1 # browse backwards
            fail = Fail()
def rate_limit_status(twitter):
    """Print current Twitter API rate limit status.

    Prints two lines to standard output: the remaining/total request count
    for the current interval, and the time until the next reset both as
    seconds from now and as a local-time timestamp.

    Args:
        twitter: API object; ``twitter.application.rate_limit_status()``
            must return an object with ``rate_limit_remaining``,
            ``rate_limit_limit`` and ``rate_limit_reset`` attributes, the
            latter being seconds since the epoch.
    """
    rls = twitter.application.rate_limit_status()
    reset = rls.rate_limit_reset
    print("Remaining API requests: %i/%i (interval limit)"
          % (rls.rate_limit_remaining, rls.rate_limit_limit))
    print("Next reset in %is (%s)"
          % (int(reset - _time.time()),
             _time.asctime(_time.localtime(reset))))
def _load_archive(filename, label):
    """Load an existing archive, or return an empty dict if loading fails.

    *label* ("tweets" or "DMs") only customises the error message.
    """
    try:
        return load_tweets(filename)
    except Exception as e:
        err("Error when loading saved %s: %s - continuing without"
            % (label, str(e)))
        return {}


def _fetch_or_exit(*args, **kwargs):
    """Call statuses(); on Ctrl-C report the interruption and exit."""
    try:
        statuses(*args, **kwargs)
    except KeyboardInterrupt:
        # NOTE(review): handler body reconstructed (missing from the
        # scraped listing) -- confirm against upstream.
        err()
        err("Interrupted")
        raise SystemExit(1)


def main(args=sys.argv[1:]):
    """Command-line entry point of the archiver.

    Parses *args*, authenticates (OAuth or anonymous), then performs the
    requested work: print the API rate status, archive the authenticated
    user's timeline or mentions, archive direct messages, and archive the
    tweets (or favorites) of every user named on the command line or read
    from standard input when the single argument is ``-``.

    Args:
        args: Command-line arguments without the program name.

    NOTE(review): many lines of this function are missing from the scraped
    listing: most option defaults, every statement following an error
    message, the trailing arguments of the oauth_dance()/OAuth() calls,
    the ``os.sep`` joining the OAuth file path, the ``global`` statement
    and the loop header. They are rebuilt from the visible code and the
    module usage text; confirm against upstream.
    """
    options = {
        'oauth': False,
        'save-dir': ".",
        'api-rate': False,
        'timeline': "",
        'mentions': "",
        'dms': "",
        'favorites': False,
        'follow-redirects': False,
        'redirect-sites': None,
        'isoformat': False,
    }
    try:
        parse_args(args, options)
    except GetoptError as e:
        err("I can't do that, %s." % e)
        raise SystemExit(1)

    # exit if no user given
    # except if asking for API rate, or archive of timeline or mentions
    if not options['extra_args'] and not (options['api-rate'] or
                                          options['timeline'] or
                                          options['mentions'] or
                                          options['dms']):
        print(__doc__)
        return

    # authenticate using OAuth, asking for token if necessary
    if options['oauth']:
        oauth_filename = (os.environ.get('HOME',
                          os.environ.get('USERPROFILE', ''))
                          + os.sep
                          + '.twitter-archiver_oauth')

        if not os.path.exists(oauth_filename):
            oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
                        oauth_filename)
        oauth_token, oauth_token_secret = read_token_file(oauth_filename)
        auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY,
                     CONSUMER_SECRET)
    else:
        auth = NoAuth()

    twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')

    if options['api-rate']:
        rate_limit_status(twitter)
        return

    # statuses_portion() formats tweet text through this module-level hook
    global format_text
    if options['follow-redirects'] or options['redirect-sites'] :
        if options['redirect-sites']:
            hosts = parse_host_list(options['redirect-sites'])
        else:
            hosts = None
        format_text = functools.partial(expand_format_text, hosts)
    else:
        format_text = direct_format_text

    # save own timeline or mentions (the user used in OAuth)
    if options['timeline'] or options['mentions']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save timeline or mentions.")
            raise SystemExit(1)

        if options['timeline']:
            filename = options['save-dir'] + os.sep + options['timeline']
            print("* Archiving own timeline in %s" % filename)
        elif options['mentions']:
            filename = options['save-dir'] + os.sep + options['mentions']
            print("* Archiving own mentions in %s" % filename)

        tweets = _load_archive(filename, "tweets")
        _fetch_or_exit(twitter, "", tweets, options['mentions'],
                       options['favorites'], isoformat=options['isoformat'])

        save_tweets(filename, tweets)
        if options['timeline']:
            print("Total tweets in own timeline: %i" % len(tweets))
        elif options['mentions']:
            print("Total mentions: %i" % len(tweets))

    if options['dms']:
        if isinstance(auth, NoAuth):
            err("You must be authenticated to save DMs.")
            raise SystemExit(1)

        filename = options['save-dir'] + os.sep + options['dms']
        print("* Archiving own DMs in %s" % filename)

        dms = _load_archive(filename, "DMs")
        # received first, then sent; both end up in the same archive
        _fetch_or_exit(twitter, "", dms, received_dms=True,
                       isoformat=options['isoformat'])
        _fetch_or_exit(twitter, "", dms, received_dms=False,
                       isoformat=options['isoformat'])

        save_tweets(filename, dms)
        print("Total DMs sent and received: %i" % len(dms))

    # read users from command-line or stdin
    users = options['extra_args']
    if len(users) == 1 and users[0] == "-":
        users = [line.strip() for line in sys.stdin.readlines()]

    # save tweets for every user
    total, total_new = 0, 0
    for user in users:
        filename = options['save-dir'] + os.sep + user
        if options['favorites']:
            filename = filename + "-favorites"
        print("* Archiving %s tweets in %s" % (user, filename))

        tweets = _load_archive(filename, "tweets")
        before = len(tweets)
        _fetch_or_exit(twitter, user, tweets, options['mentions'],
                       options['favorites'], isoformat=options['isoformat'])

        save_tweets(filename, tweets)
        total += len(tweets)
        new = len(tweets) - before
        total_new += new
        print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new))

    print("Total: %i tweets (%i new) for %i users"
          % (total, total_new, len(users)))