On Fri, Apr 17, 2009 at 01:02:57PM +0200, Bruno Haible wrote: > Hello Ondrej, > > > > Hello. I am writing partial fnmatch to speed up locate et al. > > Cool! We know for some time already that this is a bottleneck [1]. > I find it also interesting that you go for a two-step approach, > preprocess the pattern once and use it for matching often - the same > approach that we considered useful in [2]. I finished version with only * and ? support. I attach it. I looked more into source and discovered fnmatch doesn't work as I imagined. By default it converts strings into widechars and match there. utf8 allows searching be done bitwise. Its in most cases faster.
Is ok just use original fnmatch if pattern contains extended wildcard or [] with nonascii symbol? > Yes, implementing a case-insensitive matching by doing an uppercasing > pass on both sides is an old technique that ignores the properties of > a couple of languages. For good support of all languages, it's necessary > to use a "casefolding" pass, such as the one described by the Unicode > standard and implemented in gnulib (and soon, libunistring), in > "unicase.h". In particular the function ulc_u8_casefold from > lib/unicase/ulc-casecmp.c looks like what's needed as preprocessing pass > for an arbitrary file name in locale encoding. Here is casefold patch for fnmatch. (abusing wchar=u32) Shouldn't be there also added normalization? --- fnmatch.c.old 2008-10-16 20:59:48.000000000 +0200 +++ fnmatch.c 2009-04-18 09:22:26.109919887 +0200 @@ -51,6 +51,7 @@ #if defined _LIBC || WIDE_CHAR_SUPPORT # include <wctype.h> # include <wchar.h> +# include <unicase.h> #endif /* We need some of the locale data (the collation sequence information) @@ -177,7 +178,12 @@ # if HANDLE_MULTIBYTE -# define FOLD(c) ((flags & FNM_CASEFOLD) ? towlower (c) : (c)) +#ifdef _LIBC + #define FOLD(c) (c) +#else + #define FOLD(c) ((flags & FNM_CASEFOLD) ? towlower (c) : (c)) +#endif + # define CHAR wchar_t # define UCHAR wint_t # define INT wint_t @@ -327,10 +333,19 @@ mbsrtowcs (wpattern, &pattern, patsize, &ps); assert (mbsinit (&ps)); mbsrtowcs (wstring, &string, strsize, &ps); - - res = internal_fnwmatch (wpattern, wstring, wstring + strsize - 1, + wchar_t *wfoldpattern,*wfoldstring; + wfoldpattern=wpattern;wfoldstring=wstring; +#ifdef _LIBC + if (flags & FNM_CASEFOLD){ + wfoldpattern=u32_casefold(wpattern,patsize,uc_locale_language (), NULL,NULL,&patsize); + wfoldstring=u32_casefold(wstring,patsize,uc_locale_language (), NULL,NULL,&patsize); + } +#endif + res = internal_fnwmatch (wfoldpattern, wfoldstring, wfoldstring + strsize - 1, flags & FNM_PERIOD, flags); - +#ifdef _LIBC + if (flags & FNM_CASEFOLD){free(wfoldpattern);free(wfoldstring); } +#endif if (__builtin_expect (! (totsize < ALLOCA_LIMIT), 0)) free (wpattern); return res;
#include <stdint.h> #include <fnmatch.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "fnmatch2.h" #define UINT uint32_t #define UINTSIZE 4 #define FNM_NOMATCH 1 typedef struct { char tp; char chr; } chr_pattern; typedef struct { char tp; UINT chars; } chars_pattern; typedef struct { char tp; } qst_pattern; typedef struct { char tp; char chr; short sizerest; } star_pattern; typedef struct { char tp; } end_pattern; typedef struct { char tp; } endstar_pattern; typedef struct { char tp; char *chars; } bracket_pattern; enum PATTERNS {PATTERN_chr,PATTERN_qst,PATTERN_star,PATTERN_chars,PATTERN_end,PATTERN_endstar,PATTERN_bracket}; union _patterns{ chr_pattern chr;chars_pattern chars;qst_pattern qst;star_pattern star;end_pattern end;endstar_pattern endstar;bracket_pattern bracket; }; typedef union _patterns patterns; struct states{ patterns *p; char pattern_prefix[4]; struct states *next; }; struct _fnmatch_state{ struct states *states; int flags; }; #define NEXTPATTERN(type) pat->chr.tp=PATTERN_##type;pat=(patterns *)(((void *) pat)+sizeof(type##_pattern )); fnmatch_state *fnmatch_compile(char *str,int flags){ unsigned char *s; patterns *pat,*ret; int canpack; int i,m; ret=pat=(patterns *) malloc(200); int rest=0;//compute minimal string size for (s=(unsigned char *)str;*s;s++) { if (*s=='[') { while (*s!=']') s++; }/*] gets counted line below.*/ if (*s!='*') rest++; } for (s=(unsigned char *)str;*s;s++) { switch(*s){ case '*': while (*(s+1)=='?') {/* *?=?* */ NEXTPATTERN(qst) rest--; s++; } if (*(s+1)){ pat->star.sizerest=rest; pat->star.chr=*(s+1); s++; NEXTPATTERN(star) }else{ NEXTPATTERN(endstar) } break; case '?': NEXTPATTERN(qst) rest--; break; case '[': pat->bracket.chars=malloc(255); char *c= pat->bracket.chars; while (*s!=']') *c++=*s++; rest--; NEXTPATTERN(bracket); default: canpack=1; for (i=0;i<4;i++) if (*(s+i)=='*'||*(s+i)=='?') canpack=0; if (canpack){ pat->chars.chars=*((UINT*) s); rest-=UINTSIZE; s+=UINTSIZE-1;//last get added automaticaly. NEXTPATTERN(chars) }else{ pat->chr.chr=*s; rest--; NEXTPATTERN(chr) } break; } } NEXTPATTERN(end) fnmatch_state *st=(fnmatch_state *) malloc(sizeof(fnmatch_state)); st->states=calloc(1,sizeof(struct states)); st->states->p=ret; return st; } #undef NEXTPATTERN #define NEXTPATTERN(type) p=(patterns *)(((void *) p)+sizeof(type##_pattern )); int fnmatch2_internal(patterns *p,char *str,int len){ #include "fnmatch2_loop.h" } char buf[1000]; int fnmatch_exec(fnmatch_state *p, char *str){ char *bu=buf; int i; struct states *s; int len=strlen(str); for (s=p->states;s;s=s->next){ for (i=0;s->pattern_prefix[i];i++){ if(str[i]!=s->pattern_prefix[i]) goto con; } if(!fnmatch2_internal(s->p,str+i,len-i)) return 0; con:; } return FNM_NOMATCH; } #define FNMATCH_PREFIX int fnmatch2_prefix_internal(patterns *p,char *str,int len,fnmatch_state *ret){ #include "fnmatch2_loop.h" } fnmatch_state* fnmatch_prefix(fnmatch_state *p,char *str){ struct states *s; int len=strlen(str),i; fnmatch_state *ret=(fnmatch_state *) calloc(1,sizeof(fnmatch_state)); ret->flags=p->flags; for (s=p->states;s;s=s->next){ for (i=0;s->pattern_prefix[i];i++){ if(str[i]!=s->pattern_prefix[i]) goto con; } fnmatch2_prefix_internal(s->p,str,len,ret); con:; } return ret; } int main(){int i; char ptr[]="*abc*"; int flags=0; fnmatch_state *p=fnmatch_compile(ptr,flags); int cnt=0; char s1[]="abcda", s2[]="bbida",s3[]="nabcx"; for (i=0;i<1000000;i++) { s1[1]++;s2[1]++;s3[1]++; #ifdef COMPARE cnt+=fnmatch(ptr,s1,flags)+fnmatch(ptr,s2,flags)+fnmatch(ptr,s3,flags); #else cnt+=fnmatch_exec(p,s1)+fnmatch_exec(p,s2)+fnmatch_exec(p,s3); #endif } printf("%i\n",cnt); fnmatch_state* s=fnmatch_compile("*foo*",0); fnmatch_state* home=fnmatch_prefix(s,"/home"); fnmatch_state* bar=fnmatch_prefix(s,"/bar"); printf("%i ",fnmatch_exec(home,"foo")); printf("%i ",fnmatch_exec(bar,"foo")); printf("%i ",fnmatch_exec(bar,"bar")); }
struct _fnmatch_state; typedef struct _fnmatch_state fnmatch_state; /*compile pattern*/ fnmatch_state* fnmatch_compile(char *pattern,int flags); /* add prefix to matched strings. suppose you want match foo againist /home/foo, /home/bar/foo, /home/bar/bar fnmatch_state* s=fnmatch_compile("foo",0); fnmatch_state* home=fnmatch_prefix(s,"/home"); fnmatch_state* bar=fnmatch_prefix(s,"/bar"); fnmatch_exec(home,"foo"); fnmatch_exec(bar,"foo"); fnmatch_exec(bar,"bar"); */ fnmatch_state* fnmatch_prefix(fnmatch_state *pattern,char *prefix); int fnmatch_exec(fnmatch_state * pattern,char *string);
char *s=str; int i,j; int sizerest,chars,chr; while (1){ #ifndef FNMATCH_PREFIX #else struct states *state; #define ADDSTATE state=calloc(1,sizeof(struct states));state->next=ret->states; ret->states=state; state->p=p; if (!*s){ ADDSTATE; return 0; } #endif switch (p->chr.tp){ case PATTERN_chr: if (*s != p->chr.chr) return FNM_NOMATCH; NEXTPATTERN(chr); s++; break; case PATTERN_chars: #ifndef FNMATCH_PREFIX if ((* (UINT *)s)!=p->chars.chars) return FNM_NOMATCH; s+=UINTSIZE; NEXTPATTERN(chars); #else ; int re=p->chars.chars; char *c=(char *) &re; NEXTPATTERN(chars) for (i=0;i<4;i++){ if (!*(s+i)){ ADDSTATE *((int *)ret->states->pattern_prefix)=re; } if (*(s+i)!=*(c+i)) return FNM_NOMATCH; re=re<<8; } #endif break; case PATTERN_qst: s+=1; NEXTPATTERN(qst); break; case PATTERN_star: ; #ifndef FNMATCH_PREFIX sizerest= p->star.sizerest; chr= p->star.chr; NEXTPATTERN(star); for (i=s-str;i<=len-sizerest ; i++ ) { if (( *(str+i) == chr) && !fnmatch2_internal(p,str+i+1,len-i-1)) return 0; } return FNM_NOMATCH; #else ADDSTATE chr= p->star.chr; NEXTPATTERN(star); for (i=s-str;i<len ;i++){ if ( *(str+i) == chr) fnmatch2_prefix_internal(p,str+i+1,len-i-1,ret); } #endif break; case PATTERN_end: return (*s)?FNM_NOMATCH:0 ; break; case PATTERN_endstar: return 0; break; case PATTERN_bracket: ; char *ch=p->bracket.chars; while (*ch) if (*ch++==*s) goto cont; return FNM_NOMATCH; cont:; break; } }