This patch adds an implementation of Levenshtein distance to gcc,
along with unit testing of the algorithm.

The unit testing is implemented via a plugin within gcc.dg/plugin.
(I'd prefer to do this via the unit testing patches I've been
proposing in a separate patch kit, but to avoid depending on that
this kit does it within a custom plugin.)

The plugin actually fails until followup patches are applied, with:

 cc1: error: cannot load plugin ./levenshtein_plugin.so
 ./levenshtein_plugin.so: undefined symbol: _Z20levenshtein_distancePKcS0_

due to nothing in the tree initially using the API, but I've broken
it out in the hope that it makes review easier.

gcc/ChangeLog:
        * Makefile.in (OBJS): Add spellcheck.o.
        * spellcheck.c: New file.
        * spellcheck.h: New file.

gcc/testsuite/ChangeLog:
        * gcc.dg/plugin/levenshtein-test-1.c: New file.
        * gcc.dg/plugin/levenshtein_plugin.c: New file.
        * gcc.dg/plugin/plugin.exp (plugin_test_list): Add
        levenshtein_plugin.c.
---
 gcc/Makefile.in                                  |   1 +
 gcc/spellcheck.c                                 | 136 +++++++++++++++++++++++
 gcc/spellcheck.h                                 |  32 ++++++
 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c |   9 ++
 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c |  64 +++++++++++
 gcc/testsuite/gcc.dg/plugin/plugin.exp           |   1 +
 6 files changed, 243 insertions(+)
 create mode 100644 gcc/spellcheck.c
 create mode 100644 gcc/spellcheck.h
 create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
 create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 2685b38..9fb643e 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1394,6 +1394,7 @@ OBJS = \
        shrink-wrap.o \
        simplify-rtx.o \
        sparseset.o \
+       spellcheck.o \
        sreal.o \
        stack-ptr-mod.o \
        statistics.o \
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
new file mode 100644
index 0000000..532df58
--- /dev/null
+++ b/gcc/spellcheck.c
@@ -0,0 +1,136 @@
+/* Find near-matches for strings and identifiers.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "spellcheck.h"
+
+/* The Levenshtein distance is an "edit-distance": the minimal
+   number of one-character insertions, removals or substitutions
+   that are needed to change one string into another.
+
+   This implementation uses the Wagner-Fischer algorithm.  */
+
+static edit_distance_t
+levenshtein_distance (const char *s, int m,
+                     const char *t, int n)
+{
+  const bool debug = false;
+
+  if (debug)
+    {
+      printf ("s: \"%s\" (m=%i)\n", s, m);
+      printf ("t: \"%s\" (n=%i)\n", t, n);
+    }
+
+  if (m == 0)
+    return n;
+  if (n == 0)
+    return m;
+
+  /* We effectively build a matrix where each (i, j) contains the
+     Levenshtein distance between the prefix strings s[0:i]
+     and t[0:j].
+     Rather than actually build an (m + 1) * (n + 1) matrix,
+     we simply keep track of the last row, v0 and a new row, v1,
+     which avoids an (m + 1) * (n + 1) allocation and memory accesses
+     in favor of two (m + 1) allocations.  These could potentially be
+     statically-allocated if we impose a maximum length on the
+     strings of interest.  */
+  edit_distance_t *v0 = new edit_distance_t[m + 1];
+  edit_distance_t *v1 = new edit_distance_t[m + 1];
+
+  /* The first row is for the case of an empty target string, which
+     we can reach by deleting every character in the source string.  */
+  for (int i = 0; i < m + 1; i++)
+    v0[i] = i;
+
+  /* Build successive rows.  */
+  for (int i = 0; i < n; i++)
+    {
+      if (debug)
+       {
+         printf ("i:%i v0 = ", i);
+         for (int j = 0; j < m + 1; j++)
+           printf ("%i ", v0[j]);
+         printf ("\n");
+       }
+
+      /* The initial column is for the case of an empty source string; we
+        can reach prefixes of the target string of length i
+        by inserting i characters.  */
+      v1[0] = i + 1;
+
+      /* Build the rest of the row by considering neighbours to
+        the north, west and northwest.  */
+      for (int j = 0; j < m; j++)
+       {
+         edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
+         edit_distance_t deletion     = v1[j] + 1;
+         edit_distance_t insertion    = v0[j + 1] + 1;
+         edit_distance_t substitution = v0[j] + cost;
+         edit_distance_t cheapest = MIN (deletion, insertion);
+         cheapest = MIN (cheapest, substitution);
+         v1[j + 1] = cheapest;
+       }
+
+      /* Prepare to move on to next row.  */
+      for (int j = 0; j < m + 1; j++)
+       v0[j] = v1[j];
+    }
+
+  if (debug)
+    {
+      printf ("final v1 = ");
+      for (int j = 0; j < m + 1; j++)
+       printf ("%i ", v1[j]);
+      printf ("\n");
+    }
+
+  edit_distance_t result = v1[m];
+  delete[] v0;
+  delete[] v1;
+  return result;
+}
+
+/* Calculate Levenshtein distance between two nil-terminated strings.
+   This exists purely for the unit tests.  */
+
+edit_distance_t
+levenshtein_distance (const char *s, const char *t)
+{
+  return levenshtein_distance (s, strlen (s), t, strlen (t));
+}
+
+/* Calculate Levenshtein distance between two identifiers.  */
+
+edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t)
+{
+  gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
+  gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
+
+  return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
+                              IDENTIFIER_LENGTH (ident_s),
+                              IDENTIFIER_POINTER (ident_t),
+                              IDENTIFIER_LENGTH (ident_t));
+}
diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h
new file mode 100644
index 0000000..58355d6
--- /dev/null
+++ b/gcc/spellcheck.h
@@ -0,0 +1,32 @@
+/* Find near-matches for strings and identifiers.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_SPELLCHECK_H
+#define GCC_SPELLCHECK_H
+
+typedef unsigned int edit_distance_t;
+const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX;
+
+extern edit_distance_t
+levenshtein_distance (const char *s, const char *t);
+
+extern edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t);
+
+#endif  /* GCC_SPELLCHECK_H  */
diff --git a/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c 
b/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
new file mode 100644
index 0000000..ac49992
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
@@ -0,0 +1,9 @@
+/* Placeholder C source file for unit-testing gcc/spellcheck.c.  */
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+int
+main (int argc, char **argv)
+{
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c 
b/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c
new file mode 100644
index 0000000..3e7dc78
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c
@@ -0,0 +1,64 @@
+/* Plugin for unittesting gcc/spellcheck.h.  */
+
+#include "config.h"
+#include "gcc-plugin.h"
+#include "system.h"
+#include "coretypes.h"
+#include "spellcheck.h"
+#include "diagnostic.h"
+
+int plugin_is_GPL_compatible;
+
+static void
+levenshtein_distance_unit_test_oneway (const char *a, const char *b,
+                                      edit_distance_t expected)
+{
+  edit_distance_t actual = levenshtein_distance (a, b);
+  if (actual != expected)
+    error ("levenshtein_distance (\"%s\", \"%s\") : expected: %i got %i",
+          a, b, expected, actual);
+}
+
+
+static void
+levenshtein_distance_unit_test (const char *a, const char *b,
+                               edit_distance_t expected)
+{
+  /* Run every test both ways to ensure it's symmetric.  */
+  levenshtein_distance_unit_test_oneway (a, b, expected);
+  levenshtein_distance_unit_test_oneway (b, a, expected);
+}
+
+/* Callback handler for the PLUGIN_FINISH event; run
+   levenshtein_distance unit tests here.  */
+
+static void
+on_finish (void */*gcc_data*/, void */*user_data*/)
+{
+  levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
+  levenshtein_distance_unit_test ("saturday", "sunday", 3);
+  levenshtein_distance_unit_test ("foo", "m_foo", 2);
+  levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
+  levenshtein_distance_unit_test
+    ("the quick brown fox jumps over the lazy dog", "dog", 40);
+  levenshtein_distance_unit_test
+    ("the quick brown fox jumps over the lazy dog",
+     "the quick brown dog jumps over the lazy fox",
+     4);
+  levenshtein_distance_unit_test
+    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
+     "All your base are belong to us",
+     44);
+}
+
+int
+plugin_init (struct plugin_name_args *plugin_info,
+            struct plugin_gcc_version */*version*/)
+{
+  register_callback (plugin_info->base_name,
+                    PLUGIN_FINISH,
+                    on_finish,
+                    NULL); /* void *user_data */
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/plugin.exp 
b/gcc/testsuite/gcc.dg/plugin/plugin.exp
index 39fab6e..80fc539 100644
--- a/gcc/testsuite/gcc.dg/plugin/plugin.exp
+++ b/gcc/testsuite/gcc.dg/plugin/plugin.exp
@@ -63,6 +63,7 @@ set plugin_test_list [list \
     { start_unit_plugin.c start_unit-test-1.c } \
     { finish_unit_plugin.c finish_unit-test-1.c } \
     { wide-int_plugin.c wide-int-test-1.c } \
+    { levenshtein_plugin.c levenshtein-test-1.c } \
 ]
 
 foreach plugin_test $plugin_test_list {
-- 
1.8.5.3

Reply via email to