* lib/diffseq.h (CHECK_CONDITION): New required define; may be empty. (compareseq): Use it after each note, and recursive compareseq invocation. * lib/fstrcmp.c (EXTRA_CONTEXT_FIELDS): New field edit_count_limit. (CHECK_CONDITION): New define; check if the bound has been reached. (fstrcmp_internal): New argument 'bound', used to compute edit_count_limit. Return zero if the limit has been reached. (fstrcmp): Adjust caller, using zero bound. (fstrcmp_if_higher): Likewise, using given bound. ---
This patch implements (2), premature termination of the compareseq code. This patch requires diffutils to add an empty CHECK_CONDITION define, but otherwise has no impact on its use of diffseq.h. The patch relies on the previous posted patch. I should note that the computation of edit_count_limit adds 1 (not 0.5) to the computation to avoid issues with strings that have very similar fstrcmp values. This change may slow down fstrcmp a bit (I haven't measured), but it's clearly not an algorithmic decay, and the additional tests are outside of the hot inner loop (diag). OK to apply to gnulib? Thanks, Ralf lib/diffseq.h | 9 ++++++++- lib/fstrcmp.c | 32 ++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/lib/diffseq.h b/lib/diffseq.h index 7a62196..93091fd 100644 --- a/lib/diffseq.h +++ b/lib/diffseq.h @@ -49,6 +49,7 @@ EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'. NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff]. NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff]. + CHECK_CONDITION Check for premature termination. USE_HEURISTIC (Optional) Define if you want to support the heuristic for large vectors. Before including this file, you also need to include: @@ -407,7 +408,9 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal, If FIND_MINIMAL, find a minimal difference no matter how expensive it is. - The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. */ + The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. + The algorithm may be terminated prematurely using CHECK_CONDITION. + */ static void compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, @@ -435,12 +438,14 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, while (yoff < ylim) { NOTE_INSERT (ctxt, yoff); + CHECK_CONDITION (ctxt); yoff++; } else if (yoff == ylim) while (xoff < xlim) { NOTE_DELETE (ctxt, xoff); + CHECK_CONDITION (ctxt); xoff++; } else @@ -452,6 +457,7 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, /* Use the partitions to split this problem into subproblems. */ compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt); + CHECK_CONDITION (ctxt); compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt); } } @@ -462,5 +468,6 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, #undef EXTRA_CONTEXT_FIELDS #undef NOTE_DELETE #undef NOTE_INSERT +#undef CHECK_CONDITION #undef USE_HEURISTIC #undef OFFSET_MAX diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c index 5ab6136..efb7331 100644 --- a/lib/fstrcmp.c +++ b/lib/fstrcmp.c @@ -67,9 +67,13 @@ #define EXTRA_CONTEXT_FIELDS \ /* The number of elements inserted or deleted. */ \ int xvec_edit_count; \ - int yvec_edit_count; + int yvec_edit_count; \ + int edit_count_limit; #define NOTE_DELETE(ctxt, xoff) ctxt->xvec_edit_count++ #define NOTE_INSERT(ctxt, yoff) ctxt->yvec_edit_count++ +#define CHECK_CONDITION(ctxt) \ + if (ctxt->xvec_edit_count + ctxt->yvec_edit_count >= ctxt->edit_count_limit) \ + return /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of fstrcmp(). */ #include "diffseq.h" @@ -98,10 +102,10 @@ gl_once_define(static, keys_init_once) /* Compute and return the fstrcmp similarity value of STRING1 and STRING2 - with lengths XVEC_LENGTH and YVEC_LENGTH. */ + if it is higher than BOUND. Otherwise, return 0. */ static double fstrcmp_internal (const char *string1, const char *string2, - int xvec_length, int yvec_length) + int xvec_length, int yvec_length, double bound) { struct context ctxt; int i; @@ -155,15 +159,19 @@ fstrcmp_internal (const char *string1, const char *string2, /* Now do the main comparison algorithm */ ctxt.xvec_edit_count = 0; ctxt.yvec_edit_count = 0; + ctxt.edit_count_limit = (1. - bound) * (xvec_length + yvec_length) + 1; compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt); - /* The result is - ((number of chars in common) / (average length of the strings)). - This is admittedly biased towards finding that the strings are - similar, however it does produce meaningful results. */ - return ((double) (xvec_length + yvec_length - - ctxt.yvec_edit_count - ctxt.xvec_edit_count) - / (xvec_length + yvec_length)); + if (ctxt.xvec_edit_count + ctxt.yvec_edit_count >= ctxt.edit_count_limit) + return 0.; + else + /* The result is + ((number of chars in common) / (average length of the strings)). + This is admittedly biased towards finding that the strings are + similar, however it does produce meaningful results. */ + return ((double) (xvec_length + yvec_length + - ctxt.yvec_edit_count - ctxt.xvec_edit_count) + / (xvec_length + yvec_length)); } @@ -189,7 +197,7 @@ fstrcmp (const char *string1, const char *string2) { int xvec_length = strlen (string1); int yvec_length = strlen (string2); - return fstrcmp_internal (string1, string2, xvec_length, yvec_length); + return fstrcmp_internal (string1, string2, xvec_length, yvec_length, 0.); } /* Upper bound for the fuzzy comparison of strings of length XVEC_LENGTH and @@ -215,7 +223,7 @@ fstrcmp_if_higher (const char *string1, const char *string2, double bound) if (estimate < bound) return 0.; - value = fstrcmp_internal (string1, string2, xvec_length, yvec_length); + value = fstrcmp_internal (string1, string2, xvec_length, yvec_length, bound); if (value < bound) return 0.; return value; -- 1.6.0.1.286.g599f2