tags 615817 + patch
thanks

Hi!

I am trying the attached patch. I will keep you posted on my results.

Index: /sandbox/spam-filter-captcha/tracspamfilter/filters/bayes.py
===================================================================
--- spam-filter-captcha/tracspamfilter/filters/bayes.py	(revision 4731)
+++ spam-filter-captcha/tracspamfilter/filters/bayes.py	(revision 8319)
@@ -104,5 +104,10 @@
             self.nspam = self.nham = 0
 
+    def _sanitize(self, text):
+        """Remove invalid byte sequences from utf-8 encoded text"""
+        return text.decode('utf-8', 'ignore')
+
     def _get_row(self, word):
+        word = self._sanitize(word)
         cursor = self.db.cursor()
         cursor.execute("SELECT nspam,nham FROM spamfilter_bayes WHERE word=%s",
@@ -111,8 +116,8 @@
         if not row:
             return {}
-
         return {'nspam': row[0], 'nham': row[1]}
 
     def _set_row(self, word, nspam, nham):
+        word = self._sanitize(word)
         cursor = self.db.cursor()
         if self._has_key(word):
@@ -125,4 +130,5 @@
 
     def _delete_row(self, word):
+        word = self._sanitize(word)
         cursor = self.db.cursor()
         cursor.execute("DELETE FROM spamfilter_bayes WHERE word=%s", (word,))
@@ -130,4 +136,5 @@
 
     def _has_key(self, key):
+        key = self._sanitize(key)
         cursor = self.db.cursor()
         cursor.execute("SELECT COUNT(*) FROM spamfilter_bayes WHERE word=%s",
Index: /plugins/0.12/spam-filter-captcha/tracspamfilter/filters/bayes.py
===================================================================
--- spam-filter-captcha/tracspamfilter/filters/bayes.py	(revision 9932)
+++ spam-filter-captcha/tracspamfilter/filters/bayes.py	(revision 9933)
@@ -76,5 +76,5 @@
 
         hammie = self._get_hammie()
-        hammie.train(content.encode('utf-8'), spam)
+        hammie.train(content.encode('utf-8','ignore'), spam)
         hammie.store()
 
@@ -108,4 +108,6 @@
 
     def _sanitize(self, text):
+        if isinstance(text, unicode):
+            return text
         """Remove invalid byte sequences from utf-8 encoded text"""
         return text.decode('utf-8', 'ignore')
@@ -154,7 +156,4 @@
 
     def _wordinfoget(self, word):
-        if isinstance(word, unicode):
-            word = word.encode("utf-8")
-
         row = self._get_row(word)
         if row:
-- 
Let the machine do the dirty work.
            - The Elements of Programming Style (Kernighan & Plauger)

Reply via email to