[csw-devel] SF.net SVN: opencsw:[594] twitter/sweets/sweets.py

Thu Jan 12 00:00:58 CET 2012

Revision: 594
          http://opencsw.svn.sourceforge.net/opencsw/?rev=594&view=rev
Author:   skayser
Date:     2012-01-11 23:00:58 +0000 (Wed, 11 Jan 2012)
Log Message:
-----------
twitter/sweets: cleanup earlier ad-hoc mess, probably introduce some new

Modified Paths:
--------------
    twitter/sweets/sweets.py

Modified: twitter/sweets/sweets.py
===================================================================

--- twitter/sweets/sweets.py	2012-01-11 22:20:38 UTC (rev 593)
+++ twitter/sweets/sweets.py	2012-01-11 23:00:58 UTC (rev 594)
@@ -30,6 +30,7 @@
     """Holds twitter response data and offers operations on it."""
     def __init__(self, file="", json=""):
         self.data = {}
+        self._num_tweets_dropped = 0
         try:
             if file and os.path.exists(file):
                 self.data = self._import_file(file)
@@ -125,7 +126,18 @@
         """Get list of tweets where the tweet contains a given string."""
         return [ t for t in self.get_tweets() if t['text'].find(string) != -1 ]
 
+    def num_tweets(self):  # count of the tweets returned by get_tweets()
+        return len(self.get_tweets())
 
+    def num_tweets_dropped(self):  # running total updated by drop_nonmatching_tweets()
+        return self._num_tweets_dropped
+
+    def drop_nonmatching_tweets(self, string):  # keep only tweets whose text contains string
+        n1 = self.num_tweets()  # count before filtering
+        self.set_tweets(self.get_matching_tweets(string))
+        n2 = self.num_tweets()  # count after filtering
+        self._num_tweets_dropped += n1 - n2  # before minus after, so the total is never negative
+
 def twitter_search_mock(query="", state=""):
     """Search twitter (mockup), returns a single-line, JSON search result."""
     f = open("sample-search-results.txt", "r")
@@ -156,19 +168,20 @@
     response = TSResponse(file=outfile)
     while 1:
         try:
-            # Unfortunately, some search results are spam (text in tweet not
-            # related to query), thus we need to filter the search results
             new_response = twitter_search(query, state=response.get_state())
-            new_response.set_tweets(new_response.get_matching_tweets(query))
 
             if new_response.has_tweets():
+                # Unfortunately, some search results are spam (tweet not related
+                # to query, URLs point to spam sites), drop irrelevant tweets
+                new_response.drop_nonmatching_tweets(query)
                 new_response.print_tweets()
             response.update(new_response)
             response.cap(keep)
             response.export(outfile)
-            print "%s new tweets, %s total in archive" % (
-                    len(new_response.get_tweets()),
-                    len(response.get_tweets()))
+            print "%s new tweets (%s dropped), %s total in archive" % (
+                    new_response.num_tweets(),
+                    new_response.num_tweets_dropped(),
+                    response.num_tweets())
 
         except urllib2.URLError, e:
             print "Couldn't retrieve URL:", e

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.