From: StalkR Date: Tue, 7 Feb 2012 08:03:26 +0000 (+0100) Subject: twitter-archiver and twitter-follow initial import X-Git-Tag: twitter-1.8.0~6^2~1 X-Git-Url: https://jfr.im/git/z_archive/twitter.git/commitdiff_plain/a72824526aa3f87b5aa5cf5121f62f71aca42269?ds=sidebyside twitter-archiver and twitter-follow initial import --- diff --git a/setup.py b/setup.py index e9e8bff..7fb987e 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,8 @@ setup(name='twitter', twitter=twitter.cmdline:main twitterbot=twitter.ircbot:main twitter-log=twitter.logger:main + twitter-archiver=twitter.archiver:main + twitter-follow=twitter.follow:main twitter-stream-example=twitter.stream_example:main """, ) diff --git a/twitter/archiver.py b/twitter/archiver.py new file mode 100644 index 0000000..a5edb05 --- /dev/null +++ b/twitter/archiver.py @@ -0,0 +1,327 @@ +"""USAGE + twitter-archiver [options] <-|user> [<user> ...] + +DESCRIPTION + Archive tweets of users, sorted by date from oldest to newest, in + the following format: <id> <date> <<screen_name>> <tweet_text> + Date format is: YYYY-MM-DD HH:MM:SS TZ. Tweet <id> is used to + resume archiving on next run. Archive file name is the user name. + Provide "-" instead of users to read users from standard input. + +OPTIONS + -o --oauth authenticate to Twitter using OAuth (default no) + -s --save-dir <path> directory to save archives (default: current dir) + -a --api-rate see current API rate limit status + -t --timeline <file> archive own timeline into given file name (requires + OAuth, max 800 statuses). + +AUTHENTICATION + Authenticate to Twitter using OAuth to archive tweets of private profiles + and have higher API rate limits. OAuth authentication tokens are stored + in ~/.twitter-archiver_oauth. 
+""" + +from __future__ import print_function + +import os, sys, time, calendar, urllib2, httplib +from getopt import gnu_getopt as getopt, GetoptError + +# T-Archiver (Twitter-Archiver) application registered by @stalkr_ +CONSUMER_KEY='d8hIyfzs7ievqeeZLjZrqQ' +CONSUMER_SECRET='AnZmK0rnvaX7BoJ75l6XlilnbyMv7FoiDXWVmPD8' + +from .api import Twitter, TwitterError +from .oauth import OAuth, read_token_file +from .oauth_dance import oauth_dance +from .auth import NoAuth +from .util import Fail, err +from .follow import lookup + +def parse_args(args, options): + """Parse arguments from command-line to set options.""" + long_opts = ['help', 'oauth', 'save-dir=', 'api-rate', 'timeline='] + short_opts = "hos:at:" + opts, extra_args = getopt(args, short_opts, long_opts) + + for opt, arg in opts: + if opt in ('-h', '--help'): + print(__doc__) + raise SystemExit(0) + elif opt in ('-o', '--oauth'): + options['oauth'] = True + elif opt in ('-s', '--save-dir'): + options['save-dir'] = arg + elif opt in ('-a', '--api-rate'): + options['api-rate' ] = True + elif opt in ('-t', '--timeline'): + options['timeline'] = arg + + options['extra_args'] = extra_args + +def load_tweets(filename): + """Load tweets from file into dict, see save_tweets().""" + try: + archive = open(filename,"r") + except IOError: # no archive (yet) + return {} + + tweets = {} + for line in archive.readlines(): + tid, text = line.strip().split(" ", 1) + tweets[int(tid)] = text.decode("utf-8") + + archive.close() + return tweets + +def save_tweets(filename, tweets): + """Save tweets from dict to file. + + Save tweets from dict to UTF-8 encoded file, one per line: + <tweet id (number)> <tweet text> + Tweet text is: + <date> <<screen_name>> [RT @<screen_name>: ]<text> + + Args: + filename: A string representing the file name to save tweets to. + tweets: A dict mapping tweet-ids (int) to tweet text (str). 
+ """ + if len(tweets) == 0: + return + + try: + archive = open(filename,"w") + except IOError as e: + err("Cannot save tweets: %s" % str(e)) + return + + for k in sorted(tweets.keys()): + archive.write("%i %s\n" % (k, tweets[k].encode('utf-8'))) + + archive.close() + +def format_date(utc, to_localtime=True): + """Parse Twitter's UTC date into UTC or local time.""" + u = time.strptime(utc.replace('+0000','UTC'), '%a %b %d %H:%M:%S %Z %Y') + if to_localtime and time.timezone != 0: + t = time.localtime(calendar.timegm(u)) + return time.strftime("%Y-%m-%d %H:%M:%S", t) + " " + time.tzname[1] + else: + return time.strftime("%Y-%m-%d %H:%M:%S UTC", u) + +def format_text(text): + """Transform special chars in text to have only one line.""" + return text.replace('\n','\\n').replace('\r','\\r') + +def timeline_resolve_uids(twitter, tl): + """Resolve user ids to screen names from a timeline.""" + # get all user ids that needs a lookup (no screen_name key) + user_ids = [] + for t in tl: + rt = t.get('retweeted_status') + if rt and not rt['user'].get('screen_name'): + user_ids.append(rt['user']['id']) + if not t['user'].get('screen_name'): + user_ids.append(t['user']['id']) + + # resolve all of them at once + names = lookup(twitter, list(set(user_ids))) + + # build new timeline with resolved uids + new_tl = [] + for t in tl: + rt = t.get('retweeted_status') + if rt and not rt['user'].get('screen_name'): + name = names[rt['user']['id']] + t['retweeted_status']['user']['screen_name'] = name + if not t['user'].get('screen_name'): + name = names[t['user']['id']] + t['user']['screen_name'] = name + new_tl.append(t) + + return new_tl + +def timeline_portion(twitter, screen_name, max_id=None): + """Get a portion of the timeline of a screen name.""" + kwargs = dict(count=200, include_rts=1, screen_name=screen_name) + if max_id: + kwargs['max_id'] = max_id + + tweets = {} + if screen_name: + tl = twitter.statuses.user_timeline(**kwargs) + else: # self + tl = 
twitter.statuses.home_timeline(**kwargs) + + # some tweets do not provide screen name but user id, resolve those + for t in timeline_resolve_uids(twitter, tl): + text = t['text'] + rt = t.get('retweeted_status') + if rt: + text = "RT @%s: %s" % (rt['user']['screen_name'], rt['text']) + tweets[t['id']] = "%s <%s> %s" % (format_date(t['created_at']), + t['user']['screen_name'], + format_text(text)) + + return tweets + +def timeline(twitter, screen_name, tweets): + """Get the entire timeline of tweets for a screen name.""" + max_id = None + fail = Fail() + # get portions of timeline, incrementing max id until no new tweets appear + while True: + try: + portion = timeline_portion(twitter, screen_name, max_id) + except TwitterError as e: + if e.e.code == 401: + err("Fail: %i Unauthorized (tweets of that user are protected)" + % e.e.code) + break + elif e.e.code == 400: + err("Fail: %i API rate limit exceeded" % e.e.code) + rate = twitter.account.rate_limit_status() + reset = rate['reset_time_in_seconds'] + reset = time.asctime(time.localtime(reset)) + delay = int(rate['reset_time_in_seconds'] + - time.time()) + 5 # avoid race + err("Hourly limit of %i requests reached, next reset on %s: " + "going to sleep for %i secs" % (rate['hourly_limit'], + reset, delay)) + fail.wait(delay) + continue + elif e.e.code == 502: + err("Fail: %i Service currently unavailable, retrying..." + % e.e.code) + else: + err("Fail: %s\nRetrying..." % str(e)[:500]) + fail.wait(3) + except urllib2.URLError as e: + err("Fail: urllib2.URLError %s - Retrying..." % str(e)) + fail.wait(3) + except httplib.error as e: + err("Fail: httplib.error %s - Retrying..." % str(e)) + fail.wait(3) + except KeyError as e: + err("Fail: KeyError %s - Retrying..." 
% str(e)) + fail.wait(3) + else: + new = -len(tweets) + tweets.update(portion) + new += len(tweets) + err("Browsing %s timeline, new tweets: %i" + % (screen_name if screen_name else "home", new)) + if new < 190: + break + max_id = min(portion.keys()) # browse backwards + fail = Fail() + +def rate_limit_status(twitter): + """Print current Twitter API rate limit status.""" + r = twitter.account.rate_limit_status() + print("Remaining API requests: %i/%i (hourly limit)" + % (r['remaining_hits'], r['hourly_limit'])) + print("Next reset in %is (%s)" + % (int(r['reset_time_in_seconds'] - time.time()), + time.asctime(time.localtime(r['reset_time_in_seconds'])))) + +def main(args=sys.argv[1:]): + options = { + 'oauth': False, + 'save-dir': ".", + 'api-rate': False, + 'timeline': "" + } + try: + parse_args(args, options) + except GetoptError as e: + err("I can't do that, %s." % e) + raise SystemExit(1) + + # exit if no user given + # except if asking for API rate or archive of timeline + if not options['extra_args'] and not (options['api-rate'] or + options['timeline']): + print(__doc__) + return + + # authenticate using OAuth, asking for token if necessary + if options['oauth']: + oauth_filename = (os.getenv("HOME", "") + os.sep + + ".twitter-archiver_oauth") + if not os.path.exists(oauth_filename): + oauth_dance("Twitter-Archiver", CONSUMER_KEY, CONSUMER_SECRET, + oauth_filename) + oauth_token, oauth_token_secret = read_token_file(oauth_filename) + auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY, + CONSUMER_SECRET) + else: + auth = NoAuth() + + twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com') + + if options['api-rate']: + rate_limit_status(twitter) + return + + # save own timeline (the user used in OAuth) + if options['timeline']: + if isinstance(auth, NoAuth): + err("You must be authenticated to save timeline.") + raise SystemExit(1) + + filename = options['save-dir'] + os.sep + options['timeline'] + print("* Archiving own timeline in %s" 
% filename) + + tweets = {} + try: + tweets = load_tweets(filename) + except Exception, e: + err("Error when loading saved tweets: %s - continuing without" + % str(e)) + + try: + # no screen_name means we want home_timeline, not user_timeline + timeline(twitter, "", tweets) + except KeyboardInterrupt: + err() + err("Interrupted") + raise SystemExit(1) + + save_tweets(filename, tweets) + print("Total tweets in own timeline: %i" % len(tweets)) + + # read users from command-line or stdin + users = options['extra_args'] + if len(users) == 1 and users[0] == "-": + users = [line.strip() for line in sys.stdin.readlines()] + + # save tweets for every user + total, total_new = 0, 0 + for user in users: + filename = options['save-dir'] + os.sep + user + print("* Archiving %s tweets in %s" % (user, filename)) + + tweets = {} + try: + tweets = load_tweets(filename) + except Exception, e: + err("Error when loading saved tweets: %s - continuing without" + % str(e)) + + new = 0 + before = len(tweets) + try: + timeline(twitter, user, tweets) + except KeyboardInterrupt: + err() + err("Interrupted") + raise SystemExit(1) + + save_tweets(filename, tweets) + total += len(tweets) + new = len(tweets) - before + total_new += new + print("Total tweets for %s: %i (%i new)" % (user, len(tweets), new)) + + print("Total: %i tweets (%i new) for %i users" + % (total, total_new, len(users))) diff --git a/twitter/follow.py b/twitter/follow.py new file mode 100644 index 0000000..abc5fda --- /dev/null +++ b/twitter/follow.py @@ -0,0 +1,233 @@ +"""USAGE + twitter-follow [options] <user> + +DESCRIPTION + Display all following/followers of a user, one user per line. 
+ +OPTIONS + -o --oauth authenticate to Twitter using OAuth (default no) + -r --followers display followers of the given user (default) + -g --following display users the given user is following + -a --api-rate see your current API rate limit status + +AUTHENTICATION + Authenticate to Twitter using OAuth to see following/followers of private + profiles and have higher API rate limits. OAuth authentication tokens + are stored in the file .twitter-follow_oauth in your home directory. +""" + +from __future__ import print_function + +import os, sys, time, calendar, urllib2, httplib +from getopt import gnu_getopt as getopt, GetoptError + +# T-Follow (Twitter-Follow) application registered by @stalkr_ +CONSUMER_KEY='USRZQfvFFjB6UvZIN2Edww' +CONSUMER_SECRET='AwGAaSzZa5r0TDL8RKCDtffnI9H9mooZUdOa95nw8' + +from .api import Twitter, TwitterError +from .oauth import OAuth, read_token_file +from .oauth_dance import oauth_dance +from .auth import NoAuth +from .util import Fail, err + +def parse_args(args, options): + """Parse arguments from command-line to set options.""" + long_opts = ['help', 'oauth', 'followers', 'following', 'api-rate'] + short_opts = "horga" + opts, extra_args = getopt(args, short_opts, long_opts) + + for opt, arg in opts: + if opt in ('-h', '--help'): + print(__doc__) + raise SystemExit(1) + elif opt in ('-o', '--oauth'): + options['oauth'] = True + elif opt in ('-r', '--followers'): + options['followers'] = True + elif opt in ('-g', '--following'): + options['followers'] = False + elif opt in ('-a', '--api-rate'): + options['api-rate' ] = True + + options['extra_args'] = extra_args + +def lookup_portion(twitter, user_ids): + """Resolve a limited list of user ids to screen names.""" + users = {} + kwargs = dict(user_id=",".join(map(str, user_ids)), skip_status=1) + for u in twitter.users.lookup(**kwargs): + users[int(u['id'])] = u['screen_name'] + return users + +def lookup(twitter, user_ids): + """Resolve an entire list of user ids to screen names.""" + 
users = {} + api_limit = 100 + for i in range(0, len(user_ids), api_limit): + fail = Fail() + while True: + try: + portion = lookup_portion(twitter, user_ids[i:][:api_limit]) + except TwitterError as e: + if e.e.code == 400: + err("Fail: %i API rate limit exceeded" % e.e.code) + rate = twitter.account.rate_limit_status() + reset = rate['reset_time_in_seconds'] + reset = time.asctime(time.localtime(reset)) + delay = int(rate['reset_time_in_seconds'] + - time.time()) + 5 # avoid race + err("Hourly limit of %i requests reached, next reset on " + "%s: going to sleep for %i secs" + % (rate['hourly_limit'], reset, delay)) + fail.wait(delay) + continue + elif e.e.code == 502: + err("Fail: %i Service currently unavailable, retrying..." + % e.e.code) + else: + err("Fail: %s\nRetrying..." % str(e)[:500]) + fail.wait(3) + except urllib2.URLError as e: + err("Fail: urllib2.URLError %s - Retrying..." % str(e)) + fail.wait(3) + except httplib.error as e: + err("Fail: httplib.error %s - Retrying..." % str(e)) + fail.wait(3) + except KeyError as e: + err("Fail: KeyError %s - Retrying..." 
% str(e)) + fail.wait(3) + else: + users.update(portion) + err("Resolving user ids to screen names: %i/%i" + % (len(users), len(user_ids))) + break + return users + +def follow_portion(twitter, screen_name, cursor=-1, followers=True): + """Get a portion of followers/following for a user.""" + kwargs = dict(screen_name=screen_name, cursor=cursor) + if followers: + t = twitter.followers.ids(**kwargs) + else: # following + t = twitter.friends.ids(**kwargs) + return t['ids'], t['next_cursor'] + +def follow(twitter, screen_name, followers=True): + """Get the entire list of followers/following for a user.""" + user_ids = [] + cursor = -1 + fail = Fail() + while True: + try: + portion, cursor = follow_portion(twitter, screen_name, cursor, + followers) + except TwitterError as e: + if e.e.code == 401: + reason = ("follow%s of that user are protected" + % ("ers" if followers else "ing")) + err("Fail: %i Unauthorized (%s)" % (e.e.code, reason)) + break + elif e.e.code == 400: + err("Fail: %i API rate limit exceeded" % e.e.code) + rate = twitter.account.rate_limit_status() + reset = rate['reset_time_in_seconds'] + reset = time.asctime(time.localtime(reset)) + delay = int(rate['reset_time_in_seconds'] + - time.time()) + 5 # avoid race + err("Hourly limit of %i requests reached, next reset on %s: " + "going to sleep for %i secs" % (rate['hourly_limit'], + reset, delay)) + fail.wait(delay) + continue + elif e.e.code == 502: + err("Fail: %i Service currently unavailable, retrying..." + % e.e.code) + else: + err("Fail: %s\nRetrying..." % str(e)[:500]) + fail.wait(3) + except urllib2.URLError as e: + err("Fail: urllib2.URLError %s - Retrying..." % str(e)) + fail.wait(3) + except httplib.error as e: + err("Fail: httplib.error %s - Retrying..." % str(e)) + fail.wait(3) + except KeyError as e: + err("Fail: KeyError %s - Retrying..." 
% str(e)) + fail.wait(3) + else: + new = -len(user_ids) + user_ids = list(set(user_ids + portion)) + new += len(user_ids) + what = "follow%s" % ("ers" if followers else "ing") + err("Browsing %s %s, new: %i" % (screen_name, what, new)) + if cursor == 0: + break + fail = Fail() + return user_ids + + +def rate_limit_status(twitter): + """Print current Twitter API rate limit status.""" + r = twitter.account.rate_limit_status() + print("Remaining API requests: %i/%i (hourly limit)" + % (r['remaining_hits'], r['hourly_limit'])) + print("Next reset in %is (%s)" + % (int(r['reset_time_in_seconds'] - time.time()), + time.asctime(time.localtime(r['reset_time_in_seconds'])))) + +def main(args=sys.argv[1:]): + options = { + 'oauth': False, + 'followers': True, + 'api-rate': False + } + try: + parse_args(args, options) + except GetoptError as e: + err("I can't do that, %s." % e) + raise SystemExit(1) + + # exit if no user or given, except if asking for API rate + if not options['extra_args'] and not options['api-rate']: + print(__doc__) + raise SystemExit(1) + + # authenticate using OAuth, asking for token if necessary + if options['oauth']: + oauth_filename = (os.getenv("HOME", "") + os.sep + + ".twitter-follow_oauth") + if not os.path.exists(oauth_filename): + oauth_dance("Twitter-Follow", CONSUMER_KEY, CONSUMER_SECRET, + oauth_filename) + oauth_token, oauth_token_secret = read_token_file(oauth_filename) + auth = OAuth(oauth_token, oauth_token_secret, CONSUMER_KEY, + CONSUMER_SECRET) + else: + auth = NoAuth() + + twitter = Twitter(auth=auth, api_version='1', domain='api.twitter.com') + + if options['api-rate']: + rate_limit_status(twitter) + return + + # obtain list of followers (or following) for every given user + for user in options['extra_args']: + user_ids, users = [], {} + try: + user_ids = follow(twitter, user, options['followers']) + users = lookup(twitter, user_ids) + except KeyboardInterrupt as e: + err() + err("Interrupted.") + raise SystemExit(1) + + for uid in 
user_ids: + print(users[uid].encode("utf-8")) + + # print total on stderr to separate from user list on stdout + if options['followers']: + err("Total followers for %s: %i" % (user, len(user_ids))) + else: + err("Total users %s is following: %i" % (user, len(user_ids))) diff --git a/twitter/util.py b/twitter/util.py index b0a1f48..27142af 100644 --- a/twitter/util.py +++ b/twitter/util.py @@ -5,9 +5,12 @@ Internal utility functions. http://wiki.python.org/moin/EscapingHtml """ +from __future__ import print_function import re import sys +import time + try: from html.entities import name2codepoint unichr = chr @@ -43,3 +46,32 @@ def printNicely(string): print(string.encode('utf8')) __all__ = ["htmlentitydecode", "smrt_input"] + +def err(msg=""): + print(msg, file=sys.stderr) + +class Fail(object): + """A class to count fails during a repetitive task. + + Args: + maximum: An integer for the maximum of fails to allow. + exit: An integer for the exit code when maximum of fail is reached. + + Methods: + count: Count a fail, exit when maximum of fails is reached. + wait: Same as count but also sleep for a given time in seconds. + """ + def __init__(self, maximum=10, exit=1): + self.i = maximum + self.exit = exit + + def count(self): + self.i -= 1 + if self.i == 0: + err("Too many consecutive fails, exiting.") + raise SystemExit(self.exit) + + def wait(self, delay=0): + self.count() + if delay > 0: + time.sleep(delay)