[csw-devel] SF.net SVN: opencsw:[577] twitter/sweets/sweets.py
skayser at users.sourceforge.net
Thu Oct 27 19:09:37 CEST 2011
Revision: 577
http://opencsw.svn.sourceforge.net/opencsw/?rev=577&view=rev
Author: skayser
Date: 2011-10-27 17:09:36 +0000 (Thu, 27 Oct 2011)
Log Message:
-----------
twitter/sweets: rewrite to persist full response object
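
A rough sketch of the on-disk change (a hedged illustration: the field
values are made up, and only the "results", "refresh_url" and "max_id"
keys are implied by how the script below uses the response):

    # before this commit, the archive file held a bare list of tweets
    old_format = [ {"id": 1, "text": "..."}, {"id": 2, "text": "..."} ]

    # now the full response object is persisted, so request state such
    # as refresh_url survives across invocations
    new_format = {
        "results": [ {"id": 1, "text": "..."}, {"id": 2, "text": "..."} ],
        "refresh_url": "?since_id=2&q=opencsw",
        "max_id": 2,
    }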
Modified Paths:
--------------
twitter/sweets/sweets.py
Modified: twitter/sweets/sweets.py
===================================================================
--- twitter/sweets/sweets.py 2011-10-27 13:11:33 UTC (rev 576)
+++ twitter/sweets/sweets.py 2011-10-27 17:09:36 UTC (rev 577)
@@ -5,12 +5,11 @@
# can be used for further processing (e.g. to generate RSS)
#
# TODO:
-# * Persist state across invocations to make it cron (and Twitter) friendly
# * Add logging for debug purposes (URL fetches, responses received)
# * Rewrite file open/close blocks with "with" once python2.5+ on dev box
#
-import json
+import simplejson
import urllib
import urllib2
import os
@@ -20,116 +19,172 @@
import types
from optparse import OptionParser,SUPPRESS_HELP
-def twitter_search_mock(query="", since=""):
+class JsonParseError(Exception):
+    """Specific error for simplejson, which only raises an unspecific ValueError."""
+    pass
+
+class TSResponse:
+    """Holds twitter response data and offers operations on it."""
+    def __init__(self, file="", json=""):
+        self.data = {}
+        try:
+            if file and os.path.exists(file):
+                self.data = self._import_file(file)
+            elif json:
+                self.data = simplejson.loads(json)
+        except ValueError:
+            raise JsonParseError()
+
+    def _import_file(self, file):
+        """Import previously stored data. Should cope with older formats."""
+        data = simplejson.load(open(file))
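+        # pre-rewrite archives stored a bare list of tweets rather than
+        # the full response object; wrap them accordingly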
+        if isinstance(data, list):
+            return { "results": data }
+        else:
+            return data
+
+    def has_tweets(self):
+        return bool(self.data.get('results'))
+
+    def export(self, destfile):
+        """Export result object to a file."""
+        # create the temp file next to destfile so os.rename cannot
+        # cross filesystem boundaries
+        (fd, name) = tempfile.mkstemp(dir=os.path.dirname(destfile) or ".")
+        f = os.fdopen(fd, 'w')
+        f.write(simplejson.dumps(self.data).encode('utf-8'))
+        f.close()
+        os.rename(name, destfile)
+
+    def get_tweets(self):
+        """Get the list of tweets in the response object."""
+        return self.data.get('results', [])
+
+    def set_tweets(self, tweets):
+        """Set the list of tweets in the response object."""
+        self.data['results'] = tweets
+
+    def dedup_tweets(self):
+        """Dedup tweets in the response object by tweet ID."""
+        seen = set()
+        dtweets = []
+        for t in self.get_tweets():
+            if t['id'] in seen: continue
+            seen.add(t['id'])
+            dtweets.append(t)
+        self.set_tweets(dtweets)
+
+    def cap(self, n):
+        """Limit tweets in the response object to the last N tweets."""
+        self.set_tweets(self.get_tweets()[-n:])
+
+    def update(self, response):
+        """
+        Merge tweets and state from another response object (implicitly
+        deduplicates tweets after the merge).
+        """
+        self.set_tweets(self.get_tweets() + response.get_tweets())
+        self.dedup_tweets()
+        self.set_state(response)
+
+    def get_state(self):
+        """Return response meta-data, excluding tweet results."""
+        state = {}
+        for k, v in self.data.iteritems():
+            if k != "results": state[k] = v
+        return state
+
+    def set_state(self, response):
+        """Set response meta-data to that of another response."""
+        for k, v in response.data.iteritems():
+            if k != "results": self.data[k] = v
+
+    def print_tweets(self):
+        # Tweet URL format: http://twitter.com/#!/<from_user>/status/<id_str>
+        for tweet in self.get_tweets():
+            msg = "New tweet: \"%s...\" http://twitter.com/#!/%s/status/%s" % (
+                tweet['text'][0:40],
+                tweet['from_user'],
+                tweet['id_str']
+            )
+            # sys.stdout.encoding is None when output is piped, e.g. under cron
+            print msg.encode(sys.stdout.encoding or "utf-8", 'replace')
+
+
+def twitter_search_mock(query="", state=None):
    """Search twitter (mockup); returns canned results as a TSResponse."""
    f = open("sample-search-results.txt", "r")
    data = ''.join(f.readlines()) # concatenate into a single string
    f.close()
    return TSResponse(json=data)
-def twitter_search_real(query, since_id=""):
+def twitter_search_real(query, state=None):
    """Search twitter; returns the results wrapped in a TSResponse."""
    url_base = "http://search.twitter.com/search.json"
-    url_params = []
-    url_params.append(( "q" , query ))
-    if since_id: url_params.append(( "since_id" , since_id))
-    url = url_base + "?" + urllib.urlencode(url_params)
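+    # refresh_url as returned by the search API has the form
+    # "?since_id=<last_id>&q=<query>", so it can be appended to url_base as-is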
+    if state and state.get("refresh_url"):
+        url = url_base + state.get("refresh_url")
+    else:
+        url_params = []
+        url_params.append(( "q" , query ))
+        url = url_base + "?" + urllib.urlencode(url_params)
    print "Querying", url
    output = urllib2.urlopen(url).read()
-    return output
+    return TSResponse(json=output)
twitter_search = twitter_search_real
-def get_tweets(query, state):
-    """Trigger twitter search and extract results from JSON response."""
-    json_data = twitter_search(query, state.get('max_seen'))
-    response = json.read(json_data)
-    state['max_seen'] = response.get('max_id')
-    return response['results']
-
-def read_jsonfile(file):
-    """Read an object from a JSON file."""
-    if not os.path.exists(file): return []
-    f = open(file, "r")
-    content = f.read()
-    f.close()
-    return json.read(content)
-
-def write_jsonfile(destfile, data):
-    """Write an object to a JSON file."""
-    (fd, name) = tempfile.mkstemp()
-    f = os.fdopen(fd, 'w')
-    f.write(json.write(data).encode('utf-8'))
-    f.close()
-    os.rename(name, destfile)
-
-def dedup_tweets(tweets):
-    """Traverse list of tweets, de-dup them by ID and return new list."""
-    seen = set()
-    dtweets = []
-    for t in tweets:
-        if t['id'] in seen: continue
-        seen.add(t['id'])
-        dtweets.append(t)
-    return dtweets
-
-def log_tweet(tweet):
-    # Tweet URL format: http://twitter.com/#!/<from_user>/status/<id_str>
-    msg = "New tweet: \"%s...\" http://twitter.com/#!/%s/status/%s" % (
-        tweet['text'][0:40],
-        tweet['from_user'],
-        tweet['id_str']
-    )
-    print msg.encode(sys.stdout.encoding, 'replace')
-
-def run(interval, query, outfile, keep, state={}):
-    """Run the main loop which queries Twitter and writes the output."""
-    tweets = read_jsonfile(outfile)
+def run(query, outfile, keep, interval=None):
+    """
+    Run the main loop which queries Twitter and writes the output. Quits
+    after one pass unless an interval is given, in which case it loops.
+    """
+    response = TSResponse(file=outfile)
    while 1:
        try:
-            new_tweets = get_tweets(query, state)
-            if new_tweets:
-                tweets.extend(new_tweets)
-                tweets = dedup_tweets(tweets)
-                for t in new_tweets: log_tweet(t)
-                write_jsonfile(outfile, tweets[-keep:])
-            else:
-                print "No new tweets (%s in archive)" % len(tweets)
+            new_response = twitter_search(query, state=response.get_state())
+            if new_response.has_tweets():
+                new_response.print_tweets()
+                response.update(new_response)
+                response.cap(keep)
+                response.export(outfile)
+            print "%s new tweets, %s total in archive" % (
+                len(new_response.get_tweets()),
+                len(response.get_tweets()))
+
        except urllib2.URLError, e:
            print "Couldn't retrieve URL:", e
-        except json.ReadException, e:
-            print "Couldn't read JSON response", e
-        except json.WriteException, e:
-            print "Couldn't write JSON output", e
+        except JsonParseError:
+            print "Couldn't parse JSON data"
-        if hasattr(interval, "__call__"): interval()
-        else: time.sleep(float(interval))
+        if not interval:
+            break
+        elif hasattr(interval, "__call__"):
+            interval()
+        else:
+            time.sleep(float(interval))
if __name__ == '__main__':
    usage = "%prog -s <query> [options]"
    op = OptionParser(usage=usage)
    op.add_option("-s", dest="query", help="query term to run on Twitter")
- op.add_option("-t", dest="interval", default=300,
- help="query every X seconds (def: 300)")
op.add_option("-n", dest="limit", default=100,
help="amount of results to keep (def: 100)")
op.add_option("-o", dest="output", metavar="FILE",
help="store tweets in FILE (def: tweets-<query>.json)")
op.add_option("-d", dest="debug", action="store_true",
help=SUPPRESS_HELP)
+ op.add_option("-t", dest="interval", help=SUPPRESS_HELP)
    (options, args) = op.parse_args()
    if not options.query:
-        op.print_help()
-        sys.exit(1)
+        op.print_help()
+        sys.exit(1)
-    if not options.output:
-        options.output = "tweets-%s.json" % options.query
+    if not options.output: options.output = "tweets-%s.json" % options.query
+    if options.debug:
+        if not options.interval:
+            options.interval = sys.stdin.readline
    try:
-        if options.debug: options.interval = sys.stdin.readline
-        run(interval=options.interval,
-            query=options.query,
-            output=options.output,
-            keep=options.limit)
+        run(query=options.query,
+            outfile=options.output,
+            keep=options.limit,
+            interval=options.interval)
    except KeyboardInterrupt: pass
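
To consume the archive from another script, a minimal sketch (assumes a
hypothetical tweets-opencsw.json written by a previous run and simplejson
being installed; Python 2 syntax to match the script above):

    import simplejson

    # the archive holds the persisted response object; the tweets live
    # under its "results" key
    data = simplejson.load(open("tweets-opencsw.json"))
    for tweet in data.get("results", []):
        print "%s: %s" % (tweet["from_user"], tweet["text"][:40])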