Provide "-" instead of users to read users from standard input.
OPTIONS
- -o --oauth authenticate to Twitter using OAuth (default no)
+ -o --oauth authenticate to Twitter using OAuth (default: no)
-s --save-dir <path> directory to save archives (default: current dir)
-a --api-rate see current API rate limit status
-t --timeline <file> archive own timeline into given file name (requires
- OAuth, max 800 statuses).
+ OAuth, max 800 statuses)
+ -m --mentions <file> archive own mentions instead of timeline into
+ given file name (requires OAuth, max 800 statuses)
+ -v --favorites archive user's favorites instead of timeline
+ -f --follow-redirects follow redirects of URLs
+ -r --redirect-sites follow redirects for this comma-separated list of hosts
+ -d --dms <file> archive own direct messages (both received and
+ sent) into given file name
+ -i --isoformat store dates in ISO format (specifically RFC 3339)
AUTHENTICATION
Authenticate to Twitter using OAuth to archive tweets of private profiles
from __future__ import print_function
-import os, sys, time, calendar, urllib2, httplib
+import os, sys, time as _time, calendar, functools
+from datetime import time, date, datetime
from getopt import gnu_getopt as getopt, GetoptError
+try:
+ import urllib.request as urllib2
+ import http.client as httplib
+except ImportError:
+ import urllib2
+ import httplib
+
+
# T-Archiver (Twitter-Archiver) application registered by @stalkr_
CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ'
CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8'
from .oauth import OAuth, read_token_file
from .oauth_dance import oauth_dance
from .auth import NoAuth
-from .util import Fail, err
+from .util import Fail, err, expand_line, parse_host_list
from .follow import lookup
+from .timezones import utc as UTC, Local
def parse_args(args, options):
"""Parse arguments from command-line to set options."""
- long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=']
- short_opts = "hos:at:"
+ long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites=", 'dms=', 'isoformat']
+ short_opts = "hos:at:m:vfr:d:i"
opts, extra_args = getopt(args, short_opts, long_opts)
for opt, arg in opts:
options['api-rate' ] = True
elif opt in ('-t', '--timeline'):
options['timeline'] = arg
+ elif opt in ('-m', '--mentions'):
+ options['mentions'] = arg
+ elif opt in ('-v', '--favorites'):
+ options['favorites'] = True
+ elif opt in ('-f', '--follow-redirects'):
+ options['follow-redirects'] = True
+ elif opt in ('-r', '--redirect-sites'):
+ options['redirect-sites'] = arg
+ elif opt in ('-d', '--dms'):
+ options['dms'] = arg
+ elif opt in ('-i', '--isoformat'):
+ options['isoformat'] = True
options['extra_args'] = extra_args
tweets = {}
for line in archive.readlines():
- tid, text = line.strip().split(" ", 1)
- tweets[int(tid)] = text.decode("utf-8")
+ try:
+ tid, text = line.strip().split(" ", 1)
+ tweets[int(tid)] = text.decode("utf-8")
+ except Exception as e:
+ err("loading tweet %s failed due to %s" % (line, unicode(e)))
archive.close()
return tweets
return
for k in sorted(tweets.keys()):
- archive.write("%i %s\n" % (k, tweets[k].encode('utf-8')))
+ try:
+ archive.write("%i %s\n" % (k, tweets[k].encode('utf-8')))
+ except Exception as ex:
+ err("archiving tweet %s failed due to %s" % (k, unicode(ex)))
archive.close()
-def format_date(utc, to_localtime=True):
+def format_date(utc, isoformat=False):
"""Parse Twitter's UTC date into UTC or local time."""
- u = time.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y')
- if to_localtime and time.timezone != 0:
- t = time.localtime(calendar.timegm(u))
- return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + time.tzname[1]
+ u = datetime.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y')
+ # This is the least painful way I could find to create a non-naive
+ # datetime including a UTC timezone. Alternative suggestions
+ # welcome.
+ unew = datetime.combine(u.date(), time(u.time().hour,
+ u.time().minute, u.time().second, tzinfo=UTC))
+
+ # Convert to localtime
+ unew = unew.astimezone(Local)
+
+ if isoformat:
+ return unew.isoformat()
else:
- return time.strftime("%Y-%m-%d %H:%M:%S UTC", u)
+ return unew.strftime('%Y-%m-%d %H:%M:%S %Z')
-def format_text(text):
+def expand_format_text(hosts, text):
+ """Following redirects in links."""
+ return direct_format_text(expand_line(text, hosts))
+
+def direct_format_text(text):
"""Transform special chars in text to have only one line."""
return text.replace('\n','\\n').replace('\r','\\r')
-def timeline_resolve_uids(twitter, tl):
- """Resolve user ids to screen names from a timeline."""
+def statuses_resolve_uids(twitter, tl):
+ """Resolve user ids to screen names from statuses."""
# get all user ids that needs a lookup (no screen_name key)
user_ids = []
for t in tl:
# resolve all of them at once
names = lookup(twitter, list(set(user_ids)))
- # build new timeline with resolved uids
+ # build new statuses with resolved uids
new_tl = []
for t in tl:
rt = t.get('retweeted_status')
return new_tl
-def timeline_portion(twitter, screen_name, max_id=None):
- """Get a portion of the timeline of a screen name."""
+def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False):
+ """Get a portion of the statuses of a screen name."""
kwargs = dict(count=200, include_rts=1, screen_name=screen_name)
if max_id:
kwargs['max_id'] = max_id
tweets = {}
- if screen_name:
- tl = twitter.statuses.user_timeline(**kwargs)
- else: # self
- tl = twitter.statuses.home_timeline(**kwargs)
+ if mentions:
+ tl = twitter.statuses.mentions_timeline(**kwargs)
+ elif favorites:
+ tl = twitter.favorites.list(**kwargs)
+ elif received_dms != None:
+ if received_dms:
+ tl = twitter.direct_messages(**kwargs)
+ else: # sent DMs
+ tl = twitter.direct_messages.sent(**kwargs)
+ else: # timeline
+ if screen_name:
+ tl = twitter.statuses.user_timeline(**kwargs)
+ else: # self
+ tl = twitter.statuses.home_timeline(**kwargs)
# some tweets do not provide screen name but user id, resolve those
- for t in timeline_resolve_uids(twitter, tl):
+ # this isn't a valid operation for DMs, so special-case them
+ if received_dms == None:
+ newtl = statuses_resolve_uids(twitter, tl)
+ else:
+ newtl = tl
+ for t in newtl:
text = t['text']
rt = t.get('retweeted_status')
if rt:
text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text'])
- tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at']),
- t['user']['screen_name'],
- format_text(text))
-
+ # DMs don't include mentions by default, so in order to show who
+ # the recipient was, we synthesise a mention. If we're not
+ # operating on DMs, behave as normal
+ if received_dms == None:
+ tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at'], isoformat=isoformat),
+ t['user']['screen_name'],
+ format_text(text))
+ else:
+ tweets[t['id']] = "%s <%s> @%s %s" % (format_date(t['created_at'], isoformat=isoformat),
+ t['sender_screen_name'],
+ t['recipient']['screen_name'],
+ format_text(text))
return tweets
-def timeline(twitter, screen_name, tweets):
- """Get the entire timeline of tweets for a screen name."""
+def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False):
+ """Get all the statuses for a screen name."""
max_id = None
fail = Fail()
- # get portions of timeline, incrementing max id until no new tweets appear
+ # get portions of statuses, incrementing max id until no new tweets appear
while True:
try:
- portion = timeline_portion(twitter, screen_name, max_id)
+ portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites, received_dms, isoformat)
except TwitterError as e:
if e.e.code == 401:
err("Fail: %i Unauthorized (tweets of that user are protected)"
% e.e.code)
break
- elif e.e.code == 400:
+ elif e.e.code == 429:
err("Fail: %i API rate limit exceeded" % e.e.code)
- rate = twitter.account.rate_limit_status()
- reset = rate['reset_time_in_seconds']
- reset = time.asctime(time.localtime(reset))
- delay = int(rate['reset_time_in_seconds']
- - time.time()) + 5 # avoid race
- err("Hourly limit of %i requests reached, next reset on %s: "
- "going to sleep for %i secs" % (rate['hourly_limit'],
+ rls = twitter.application.rate_limit_status()
+ reset = rls.rate_limit_reset
+ reset = _time.asctime(_time.localtime(reset))
+ delay = int(rls.rate_limit_reset
+ - _time.time()) + 5 # avoid race
+ err("Interval limit of %i requests reached, next reset on %s: "
+ "going to sleep for %i secs" % (rls.rate_limit_limit,
reset, delay))
fail.wait(delay)
continue
new = -len(tweets)
tweets.update(portion)
new += len(tweets)
- err("Browsing %s timeline, new tweets: %i"
+ err("Browsing %s statuses, new tweets: %i"
% (screen_name if screen_name else "home", new))
if new < 190:
break
- max_id = min(portion.keys()) # browse backwards
+ max_id = min(portion.keys())-1 # browse backwards
fail = Fail()
def rate_limit_status(twitter):
"""Print current Twitter API rate limit status."""
- r = twitter.account.rate_limit_status()
- print("Remaining API requests: %i/%i (hourly limit)"
- % (r['remaining_hits'], r['hourly_limit']))
+ rls = twitter.application.rate_limit_status()
+ print("Remaining API requests: %i/%i (interval limit)"
+ % (rls.rate_limit_remaining, rls.rate_limit_limit))
print("Next reset in %is (%s)"
- % (int(r['reset_time_in_seconds'] - time.time()),
- time.asctime(time.localtime(r['reset_time_in_seconds']))))
+ % (int(rls.rate_limit_reset - _time.time()),
+ _time.asctime(_time.localtime(rls.rate_limit_reset))))
def main(args=sys.argv[1:]):
options = {
'oauth': False,
'save-dir': ".",
'api-rate': False,
- 'timeline': ""
+ 'timeline': "",
+ 'mentions': "",
+ 'dms': "",
+ 'favorites': False,
+ 'follow-redirects': False,
+ 'redirect-sites': None,
+ 'isoformat': False,
}
try:
parse_args(args, options)
raise SystemExit(1)
# exit if no user given
- # except if asking for API rate or archive of timeline
+ # except if asking for API rate, or archive of timeline, mentions or DMs
if not options['extra_args'] and not (options['api-rate'] or
- options['timeline']):
+ options['timeline'] or
+ options['mentions'] or
+ options['dms']):
print(__doc__)
return
# authenticate using OAuth, asking for token if necessary
if options['oauth']:
- oauth_filename = (os.getenv("HOME", "") + os.sep
- + ".twitter-archiver_oauth")
+ oauth_filename = (os.environ.get('HOME',
+ os.environ.get('USERPROFILE', ''))
+ + os.sep
+ + '.twitter-archiver_oauth')
+
if not os.path.exists(oauth_filename):
oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET,
oauth_filename)
else:
auth = NoAuth()
- twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com')
+ twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com')
if options['api-rate']:
rate_limit_status(twitter)
return
- # save own timeline (the user used in OAuth)
- if options['timeline']:
+ global format_text
+ if options['follow-redirects'] or options['redirect-sites'] :
+ if options['redirect-sites']:
+ hosts = parse_host_list(options['redirect-sites'])
+ else:
+ hosts = None
+ format_text = functools.partial(expand_format_text, hosts)
+ else:
+ format_text = direct_format_text
+
+ # save own timeline or mentions (the user used in OAuth)
+ if options['timeline'] or options['mentions']:
if isinstance(auth, NoAuth):
- err("You must be authenticated to save timeline.")
+ err("You must be authenticated to save timeline or mentions.")
raise SystemExit(1)
- filename = options['save-dir'] + os.sep + options['timeline']
- print("* Archiving own timeline in %s" % filename)
+ if options['timeline']:
+ filename = options['save-dir'] + os.sep + options['timeline']
+ print("* Archiving own timeline in %s" % filename)
+ elif options['mentions']:
+ filename = options['save-dir'] + os.sep + options['mentions']
+ print("* Archiving own mentions in %s" % filename)
tweets = {}
try:
tweets = load_tweets(filename)
- except Exception, e:
+ except Exception as e:
err("Error when loading saved tweets: %s - continuing without"
% str(e))
try:
- # no screen_name means we want home_timeline, not user_timeline
- timeline(twitter, "", tweets)
+ statuses(twitter, "", tweets, options['mentions'], options['favorites'], isoformat=options['isoformat'])
except KeyboardInterrupt:
err()
err("Interrupted")
raise SystemExit(1)
save_tweets(filename, tweets)
- print("Total tweets in own timeline: %i" % len(tweets))
+ if options['timeline']:
+ print("Total tweets in own timeline: %i" % len(tweets))
+ elif options['mentions']:
+ print("Total mentions: %i" % len(tweets))
+
+ if options['dms']:
+ if isinstance(auth, NoAuth):
+ err("You must be authenticated to save DMs.")
+ raise SystemExit(1)
+
+ filename = options['save-dir'] + os.sep + options['dms']
+ print("* Archiving own DMs in %s" % filename)
+
+ dms = {}
+ try:
+ dms = load_tweets(filename)
+ except Exception as e:
+ err("Error when loading saved DMs: %s - continuing without"
+ % str(e))
+
+ try:
+ statuses(twitter, "", dms, received_dms=True, isoformat=options['isoformat'])
+ statuses(twitter, "", dms, received_dms=False, isoformat=options['isoformat'])
+ except KeyboardInterrupt:
+ err()
+ err("Interrupted")
+ raise SystemExit(1)
+
+ save_tweets(filename, dms)
+ print("Total DMs sent and received: %i" % len(dms))
+
# read users from command-line or stdin
users = options['extra_args']
total, total_new = 0, 0
for user in users:
filename = options['save-dir'] + os.sep + user
+ if options['favorites']:
+ filename = filename + "-favorites"
print("* Archiving %s tweets in %s" % (user, filename))
tweets = {}
try:
tweets = load_tweets(filename)
- except Exception, e:
+ except Exception as e:
err("Error when loading saved tweets: %s - continuing without"
% str(e))
new = 0
before = len(tweets)
try:
- timeline(twitter, user, tweets)
+ statuses(twitter, user, tweets, options['mentions'], options['favorites'], isoformat=options['isoformat'])
except KeyboardInterrupt:
err()
err("Interrupted")