X-Git-Url: https://jfr.im/git/z_archive/twitter.git/blobdiff_plain/016183087df8e7902787dbd81bb09ac7a7129f18..17b9ff10207340026b876eb623660f2c79bfe85d:/twitter/archiver.py diff --git a/twitter/archiver.py b/twitter/archiver.py index 587b63e..ef2dc1f 100644 --- a/twitter/archiver.py +++ b/twitter/archiver.py @@ -21,6 +21,7 @@ OPTIONS -r --redirect-sites follow redirects for this comma separated list of hosts -d --dms archive own direct messages (both received and sent) into given file name. + -i --isoformat store dates in ISO format (specifically RFC 3339) AUTHENTICATION Authenticate to Twitter using OAuth to archive tweets of private profiles @@ -30,7 +31,8 @@ AUTHENTICATION from __future__ import print_function -import os, sys, time, calendar, functools +import os, sys, time as _time, calendar, functools +from datetime import time, date, datetime from getopt import gnu_getopt as getopt, GetoptError try: @@ -51,11 +53,12 @@ from .oauth_dance import oauth_dance from .auth import NoAuth from .util import Fail, err, expand_line, parse_host_list from .follow import lookup +from .timezones import utc as UTC, Local def parse_args(args, options): """Parse arguments from command-line to set options.""" - long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites=", 'dms='] - short_opts = "hos:at:m:vfr:d:" + long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline=', 'mentions=', 'favorites', 'follow-redirects',"redirect-sites=", 'dms=', 'isoformat'] + short_opts = "hos:at:m:vfr:d:i" opts, extra_args = getopt(args, short_opts, long_opts) for opt, arg in opts: @@ -80,6 +83,8 @@ def parse_args(args, options): options['redirect-sites'] = arg elif opt in ('-d', '--dms'): options['dms'] = arg + elif opt in ('-i', '--isoformat'): + options['isoformat'] = True options['extra_args'] = extra_args @@ -92,8 +97,11 @@ def load_tweets(filename): tweets = {} for line in archive.readlines(): - tid, text = line.strip().split(" ", 1) - tweets[int(tid)] = text.decode("utf-8") + try: + tid, text = line.strip().split(" ", 1) + tweets[int(tid)] = text.decode("utf-8") + except Exception as e: + err("loading tweet %s failed due to %s" % (line, unicode(e))) archive.close() return tweets @@ -120,18 +128,29 @@ def save_tweets(filename, tweets): return for k in sorted(tweets.keys()): - archive.write("%i %s\n" % (k, tweets[k].encode('utf-8'))) + try: + archive.write("%i %s\n" % (k, tweets[k].encode('utf-8'))) + except Exception as ex: + err("archiving tweet %s failed due to %s" % (k, unicode(ex))) archive.close() -def format_date(utc, to_localtime=True): +def format_date(utc, isoformat=False): """Parse Twitter's UTC date into UTC or local time.""" - u = time.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y') - if to_localtime and time.timezone != 0: - t = time.localtime(calendar.timegm(u)) - return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + time.tzname[1] + u = datetime.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y') + # This is the least painful way I could find to create a non-naive + # datetime including a UTC timezone. Alternative suggestions + # welcome. + unew = datetime.combine(u.date(), time(u.time().hour, + u.time().minute, u.time().second, tzinfo=UTC)) + + # Convert to localtime + unew = unew.astimezone(Local) + + if isoformat: + return unew.isoformat() else: - return time.strftime("%Y-%m-%d %H:%M:%S UTC", u) + return unew.strftime('%Y-%m-%d %H:%M:%S %Z') def expand_format_text(hosts, text): """Following redirects in links.""" @@ -169,7 +188,7 @@ def statuses_resolve_uids(twitter, tl): return new_tl -def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None): +def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorites=False, received_dms=None, isoformat=False): """Get a portion of the statuses of a screen name.""" kwargs = dict(count=200, include_rts=1, screen_name=screen_name) if max_id: @@ -177,9 +196,9 @@ def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorite tweets = {} if mentions: - tl = twitter.statuses.mentions(**kwargs) + tl = twitter.statuses.mentions_timeline(**kwargs) elif favorites: - tl = twitter.favorites(**kwargs) # API v1, favorites.list() in v1.1 + tl = twitter.favorites.list(**kwargs) elif received_dms != None: if received_dms: tl = twitter.direct_messages(**kwargs) @@ -206,38 +225,38 @@ def statuses_portion(twitter, screen_name, max_id=None, mentions=False, favorite # the recipient was, we synthesise a mention. If we're not # operating on DMs, behave as normal if received_dms == None: - tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at']), + tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at'], isoformat=isoformat), t['user']['screen_name'], format_text(text)) else: - tweets[t['id']] = "%s <%s> @%s %s" % (format_date(t['created_at']), + tweets[t['id']] = "%s <%s> @%s %s" % (format_date(t['created_at'], isoformat=isoformat), t['sender_screen_name'], t['recipient']['screen_name'], format_text(text)) return tweets -def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None): +def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, received_dms=None, isoformat=False): """Get all the statuses for a screen name.""" max_id = None fail = Fail() # get portions of statuses, incrementing max id until no new tweets appear while True: try: - portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites, received_dms) + portion = statuses_portion(twitter, screen_name, max_id, mentions, favorites, received_dms, isoformat) except TwitterError as e: if e.e.code == 401: err("Fail: %i Unauthorized (tweets of that user are protected)" % e.e.code) break - elif e.e.code == 400: + elif e.e.code == 429: err("Fail: %i API rate limit exceeded" % e.e.code) - rate = twitter.account.rate_limit_status() - reset = rate['reset_time_in_seconds'] - reset = time.asctime(time.localtime(reset)) - delay = int(rate['reset_time_in_seconds'] - - time.time()) + 5 # avoid race - err("Hourly limit of %i requests reached, next reset on %s: " - "going to sleep for %i secs" % (rate['hourly_limit'], + rls = twitter.application.rate_limit_status() + reset = rls.rate_limit_reset + reset = _time.asctime(_time.localtime(reset)) + delay = int(rls.rate_limit_reset + - _time.time()) + 5 # avoid race + err("Interval limit of %i requests reached, next reset on %s: " + "going to sleep for %i secs" % (rls.rate_limit_limit, reset, delay)) fail.wait(delay) continue @@ -272,12 +291,12 @@ def statuses(twitter, screen_name, tweets, mentions=False, favorites=False, rece def rate_limit_status(twitter): """Print current Twitter API rate limit status.""" - r = twitter.account.rate_limit_status() - print("Remaining API requests: %i/%i (hourly limit)" - % (r['remaining_hits'], r['hourly_limit'])) + rls = twitter.application.rate_limit_status() + print("Remaining API requests: %i/%i (interval limit)" + % (rls.rate_limit_remaining, rls.rate_limit_limit)) print("Next reset in %is (%s)" - % (int(r['reset_time_in_seconds'] - time.time()), - time.asctime(time.localtime(r['reset_time_in_seconds'])))) + % (int(rls.rate_limit_reset - _time.time()), + _time.asctime(_time.localtime(rls.rate_limit_reset)))) def main(args=sys.argv[1:]): options = { @@ -290,6 +309,7 @@ def main(args=sys.argv[1:]): 'favorites': False, 'follow-redirects': False, 'redirect-sites': None, + 'isoformat': False, } try: parse_args(args, options) @@ -308,8 +328,11 @@ def main(args=sys.argv[1:]): # authenticate using OAuth, asking for token if necessary if options['oauth']: - oauth_filename = (os.getenv("HOME", "") + os.sep - + ".twitter-archiver_oauth") + oauth_filename = (os.environ.get('HOME', + os.environ.get('USERPROFILE', '')) + + os.sep + + '.twitter-archiver_oauth') + if not os.path.exists(oauth_filename): oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET, oauth_filename) @@ -319,7 +342,7 @@ def main(args=sys.argv[1:]): else: auth = NoAuth() - twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com') + twitter = Twitter(auth=auth, api_version='1.1', domain='api.twitter.com') if options['api-rate']: rate_limit_status(twitter) @@ -356,7 +379,7 @@ def main(args=sys.argv[1:]): % str(e)) try: - statuses(twitter, "", tweets, options['mentions'], options['favorites']) + statuses(twitter, "", tweets, options['mentions'], options['favorites'], isoformat=options['isoformat']) except KeyboardInterrupt: err() err("Interrupted") @@ -379,13 +402,13 @@ def main(args=sys.argv[1:]): dms = {} try: dms = load_tweets(filename) - except Exception, e: + except Exception as e: err("Error when loading saved DMs: %s - continuing without" % str(e)) try: - statuses(twitter, "", dms, received_dms=True) - statuses(twitter, "", dms, received_dms=False) + statuses(twitter, "", dms, received_dms=True, isoformat=options['isoformat']) + statuses(twitter, "", dms, received_dms=False, isoformat=options['isoformat']) except KeyboardInterrupt: err() err("Interrupted") @@ -418,7 +441,7 @@ def main(args=sys.argv[1:]): new = 0 before = len(tweets) try: - statuses(twitter, user, tweets, options['mentions'], options['favorites']) + statuses(twitter, user, tweets, options['mentions'], options['favorites'], isoformat=options['isoformat']) except KeyboardInterrupt: err() err("Interrupted")