[csw-devel] SF.net SVN: opencsw:[575] twitter/sweets/sweets.py

skayser at users.sourceforge.net skayser at users.sourceforge.net
Wed Oct 26 11:51:35 CEST 2011


Revision: 575
          http://opencsw.svn.sourceforge.net/opencsw/?rev=575&view=rev
Author:   skayser
Date:     2011-10-26 09:51:35 +0000 (Wed, 26 Oct 2011)
Log Message:
-----------
twitter/sweets: add JSON file storage for found tweets

Modified Paths:
--------------
    twitter/sweets/sweets.py

Modified: twitter/sweets/sweets.py
===================================================================
--- twitter/sweets/sweets.py	2011-10-25 09:57:29 UTC (rev 574)
+++ twitter/sweets/sweets.py	2011-10-26 09:51:35 UTC (rev 575)
@@ -5,18 +5,19 @@
 #   can be used for further processing (e.g. to generate RSS)
 #
 # TODO:
+# * Persist state across invocations to make it cron (and Twitter) friendly
 # * Add logging for debug purposes (URL fetches, responses received)
-# * run: write or append result JSON to output file
 # * Rewrite file open/close blocks with "with" once python2.5+ on dev box
 #
 
 import json
 import urllib
 import urllib2
+import os
 import sys
+import tempfile
 import time
 import types
-import StringIO
 from optparse import OptionParser,SUPPRESS_HELP
 
 def twitter_search_mock(query="", since=""):
@@ -33,56 +34,99 @@
     url_params.append(( "q" , query ))
     if since_id: url_params.append(( "since_id" , since_id))
     url = url_base + "?" + urllib.urlencode(url_params)
+    print "Querying", url
     output = urllib2.urlopen(url).read()
     return output
 
 twitter_search = twitter_search_real
 
-def get_results(query, state):
+def get_tweets(query, state):
     """Trigger twitter search and extract results from JSON response.""" 
     json_data = twitter_search(query, state.get('max_seen'))
     response = json.read(json_data)
     state['max_seen'] = response.get('max_id')
     return response['results']
 
-def run(interval, query, state={}):
+def read_jsonfile(file):
+    """Read an object from a JSON file."""
+    if not os.path.exists(file): return []
+    f = open(file, "r")
+    content = f.read()
+    f.close()
+    return json.read(content)
+
+def write_jsonfile(destfile, data):
+    """Write an object to a JSON file."""
+    (fd, name) = tempfile.mkstemp(dir=os.path.dirname(destfile) or ".")
+    f = os.fdopen(fd, 'w')
+    f.write(json.write(data).encode('utf-8'))
+    f.close()
+    os.rename(name, destfile)
+
+def dedup_tweets(tweets):
+    """Traverse list of tweets, de-dup them by ID and return new list."""
+    seen = set()
+    dtweets = []
+    for t in tweets:
+        if t['id'] in seen: continue
+        seen.add(t['id'])
+        dtweets.append(t)
+    return dtweets
+
+def log_tweet(tweet):
+    # Tweet URL #http://twitter.com/#!/<from_user>/status/<id_str>
+    msg = "New tweet: \"%s...\" http://twitter.com/#!/%s/status/%s" % (
+            tweet['text'][0:40],
+            tweet['from_user'],
+            tweet['id_str']
+    )
+    print msg.encode(sys.stdout.encoding, 'replace')
+
+def run(interval, query, outfile, state={}):
     """Run the main loop which queries Twitter and writes the output."""
+    tweets = read_jsonfile(outfile)
     while 1:
         try:
-            results = get_results(query, state)
-            for result in results:
-                # Tweet URL #http://twitter.com/#!/<from_user>/status/<id_str>
-                msg = "%s - http://twitter.com/#!/%s/status/%s" % (
-                        result['text'],
-                        result['from_user'],
-                        result['id_str']
-                )
-                print msg.encode(sys.stdout.encoding, 'replace')
+            new_tweets = get_tweets(query, state)
+            if new_tweets:
+                tweets.extend(new_tweets)
+                tweets = dedup_tweets(tweets)
+                for t in new_tweets: log_tweet(t)
+                write_jsonfile(outfile, tweets)
+            else:
+                print "No new tweets (%s in archive)" % len(tweets)
         except urllib2.URLError, e:
             print "Couldn't retrieve URL:", e
         except json.ReadException, e:
             print "Couldn't read JSON response", e
+        except json.WriteException, e:
+            print "Couldn't write JSON output", e
 
         if hasattr(interval, "__call__"): interval()
         else: time.sleep(float(interval))
 
 if __name__ == '__main__':
-    usage = "%prog -s <searchterm> -t <secs>"
-    parser = OptionParser(usage=usage)
-    parser.add_option("-s", dest="searchterm", help="search term to run on Twitter")
-    parser.add_option("-t", dest="interval", default=300,
-                            help="query every X seconds (default 300)")
-    parser.add_option("-d", dest="debug", action="store_true",
-                            help=SUPPRESS_HELP)
-    (options, args) = parser.parse_args()
+    usage = "%prog -s <query> [options]"
+    op = OptionParser(usage=usage)
+    op.add_option("-s", dest="query", help="query term to run on Twitter")
+    op.add_option("-t", dest="interval", default=300,
+                        help="query every X seconds (def: 300)")
+    op.add_option("-o", dest="output", metavar="FILE",
+                        help="store tweets in FILE (def: tweets-<query>.json)")
+    op.add_option("-d", dest="debug", action="store_true",
+                        help=SUPPRESS_HELP)
+    (options, args) = op.parse_args()
 
-    required_options = ( "searchterm", )
+    required_options = ( "query", )
     for o in required_options:
         if not getattr(options, o):
             op.print_help()
             sys.exit(1)
 
+    if not options.output:
+        options.output = "tweets-%s.json" % options.query
+
     try:
         if options.debug: options.interval = sys.stdin.readline
-        run(options.interval, options.searchterm)
+        run(options.interval, options.query, options.output)
     except KeyboardInterrupt: pass

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.



More information about the devel mailing list