branch: externals/xeft
commit fb504c0a19f19e8ae555cf934ef71aec9292cecf
Author: Yuan Fu <caso...@gmail.com>
Commit: Yuan Fu <caso...@gmail.com>

    Upgrade xapian-lite
---
 xapian-lite.cc | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 2 deletions(-)

diff --git a/xapian-lite.cc b/xapian-lite.cc
index e1c9499560..d231011480 100644
--- a/xapian-lite.cc
+++ b/xapian-lite.cc
@@ -27,6 +27,7 @@ along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
 #include <exception>
 #include <iterator>
 #include <cstdarg>
+#include <string_view>
 
 #include <stdlib.h>
 #include <assert.h>
@@ -80,12 +81,31 @@ int plugin_is_GPL_compatible;
 static const Xapian::valueno DOC_MTIME = 0;
 // The index of the document value that store the file path.
 static const Xapian::valueno DOC_FILEPATH = 1;
+// If a stretch of text contains only base64 characters and exceeds
+// this length, we consider it base64 text and skip it when indexing.
+// I chose 70 because some base64 encodings wrap their output at 76
+// characters per line.  This might cause the indexer to exclude some
+// URLs from indexing too, but we probably don't want to index URLs
+// anyway.
+static const size_t BASE64_LEN_THRESHOLD = 70;
 
 static Xapian::WritableDatabase database;
 static string cached_dbpath = "";
 
 class xapian_lite_cannot_open_file: public exception {};
 
+// Return true if CHARACTER is in the base64 or base64url alphabet.
+// Letters are two ranges: 'A'..'z' would let in stray ASCII punctuation.
+static bool
+is_base64_char (char character)
+{
+  return (character >= 'A' && character <= 'Z')
+       || (character >= 'a' && character <= 'z')
+       || (character >= '0' && character <= '9')
+       || character == '+' || character == '/'
+       || character == '-' || character == '_' || character == '=';
+}
+
 // Return the hash of KEY.
 static uint64_t
 fingerprint (string key)
@@ -217,6 +237,8 @@ reindex_file
       ifstream infile (path);
       string content ((istreambuf_iterator<char>(infile)),
                       (istreambuf_iterator<char>()));
+
+      std::string_view content_view = std::string_view(content);
       // Create the indexer.
       Xapian::TermGenerator indexer;
       Xapian::Stem stemmer (lang);
@@ -228,8 +250,61 @@ reindex_file
       // Index file content.
       Xapian::Document new_doc;
       indexer.set_document (new_doc);
-      indexer.index_text (content);
-      // Set doc info.
+
+         // Index the file, skipping base64 text.
+         Xapian::Utf8Iterator iter;
+         size_t unindexed_start = 0;
+         size_t base64_start = 0;
+         bool prev_char_is_base64 = false;
+         int mode = 0;
+      for (int idx = 0; idx < content.length (); idx++)
+               {
+                 bool this_char_is_base64 = is_base64_char (content[idx]);
+                 switch (mode)
+                       {
+                       // Looking for base64 start.
+                       case 0:
+                         if (!prev_char_is_base64 && this_char_is_base64)
+                               {
+                                 base64_start = idx;
+                                 mode = 1;
+                                 break;
+                               }
+                       // Found a base64 start, keep reading and see if this is
+                       // actually base64 text.
+                       case 1:
+                         if (!this_char_is_base64)
+                               {
+                                 mode = 0;
+                                 break;
+                               }
+              if (idx - base64_start > BASE64_LEN_THRESHOLD)
+                               {
+                                 mode = 2;
+                                 break;
+                }
+                       // Detected base64, read until the end of the base64
+                       // text.
+                       case 2:
+              if (!this_char_is_base64)
+                               {
+                                 iter.assign (&content[unindexed_start], base64_start - unindexed_start);
+                                 indexer.index_text (iter);
+
+                                 mode = 0;
+                                 unindexed_start = idx;
+                                 break;
+                }
+            }
+
+                 prev_char_is_base64 = this_char_is_base64;
+        }
+
+         // Index the remaining content.
+         iter.assign (&content[unindexed_start], content.length () - unindexed_start);
+         indexer.index_text (iter);
+
+         // Set doc info.
       new_doc.add_boolean_term (termID);
       // We store the path in value, no need to use set_data.
       new_doc.add_value (DOC_FILEPATH, path);

Reply via email to