Hi all,

My facet browsing performance was decent on my system until I added my
custom analyser. Initially, I faceted the "title" field, which is of
the default string type (no analysers, tokenisers, etc.), and got
quick responses (the first query just under 1s, subsequent queries
< 0.1s). I then created a custom analyser which is not much different
from the DefaultAnalyzer in the FieldType class. Essentially, this
analyser does no tokenisation; it emits the whole value as a single
token, lower-cased, with whitespace collapsed and ignored characters
and stop words stripped. After I applied the analyser to the "title"
field, facet performance degraded considerably. Every query now takes
> 1.2s and the filterCache hit ratio is extremely small:

lookups : 918485
hits : 23
hitratio : 0.00
inserts : 918487
evictions : 917971
size : 512
cumulative_lookups : 918485
cumulative_hits : 23
cumulative_hitratio : 0.00
cumulative_inserts : 918487
cumulative_evictions : 917971
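
The evictions are almost equal to the inserts, so nearly every cached
filter is thrown out before it can ever be reused. For reference, my
filterCache still uses the stock example settings in solrconfig.xml
(the size matches the 512 reported above):

    <filterCache
        class="solr.LRUCache"
        size="512"
        initialSize="512"
        autowarmCount="256"/>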


Any ideas? Here is my analyser code:

package net.jseeker.lucene;

import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.SolrAnalyzer;

public class FacetTextAnalyser extends SolrAnalyzer {
    final int maxChars;
    final Set<Character> ignoredChars;
    final Set<String> ignoredWords;

    public final static char[] IGNORED_CHARS = {'/', '\\', '\'', '\"',
            '#', '&', '!', '?', '*', '>', '<', ','};
    public static final String[] IGNORED_WORDS = {
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is",
            "no", "not", "of", "on", "or", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with"
    };

    public FacetTextAnalyser() {
        maxChars = 255;
        ignoredChars = new HashSet<Character>();
        for (char c : IGNORED_CHARS) {
            ignoredChars.add(c);
        }
        ignoredWords = new HashSet<String>();
        for (String w : IGNORED_WORDS) {
            ignoredWords.add(w);
        }
    }

    public FacetTextAnalyser(int maxChars, Set<Character> ignoredChars,
            Set<String> ignoredWords) {
        this.maxChars = maxChars;
        this.ignoredChars = ignoredChars;
        this.ignoredWords = ignoredWords;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new Tokenizer(reader) {
            final char[] cbuf = new char[maxChars];

            // If the word ending at 'end' is a stop word, return the index
            // it started at (minus its leading separator); otherwise return
            // 'end' unchanged.
            private int dropIfIgnored(char[] buf, int end) {
                int start = end;
                while (start > 0 && buf[start - 1] != ' ')
                    start--;
                String word = new String(buf, start, end - start);
                if (ignoredWords.contains(word))
                    return (start == 0) ? 0 : start - 1;
                return end;
            }

            public Token next() throws IOException {
                // Reads at most maxChars per call; the whole value becomes
                // a single token.
                int n = input.read(cbuf, 0, maxChars);
                if (n <= 0)
                    return null;
                char[] temp = new char[n];
                int index = 0;
                boolean space = true; // true while between words
                for (int i = 0; i < n; i++) {
                    // Map ignored characters to whitespace.
                    char c = ignoredChars.contains(cbuf[i]) ? ' ' : cbuf[i];
                    if (Character.isWhitespace(c)) {
                        if (space)
                            continue; // collapse runs of whitespace
                        // A word just ended: drop it if it is a stop word.
                        index = dropIfIgnored(temp, index);
                        if (index > 0)
                            temp[index++] = ' ';
                        space = true;
                    } else {
                        temp[index++] = Character.toLowerCase(c);
                        space = false;
                    }
                }
                // Trim a trailing separator and check the final word, which
                // is never followed by whitespace inside the loop.
                if (index > 0 && temp[index - 1] == ' ')
                    index--;
                index = dropIfIgnored(temp, index);
                if (index == 0)
                    return null; // nothing survived the stripping
                // Re-capitalise the first character for display.
                temp[0] = Character.toUpperCase(temp[0]);
                return new Token(new String(temp, 0, index), 0, n);
            }
        };
    }
}
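

With the class above, a quick harness shows what the analyser emits
for a sample title (using the same Lucene 2.x TokenStream.next() /
Token.termText() API; the sample string is arbitrary):

import java.io.StringReader;
import net.jseeker.lucene.FacetTextAnalyser;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class FacetTextAnalyserCheck {
    public static void main(String[] args) throws Exception {
        FacetTextAnalyser analyser = new FacetTextAnalyser();
        TokenStream ts = analyser.tokenStream("title",
                new StringReader("The Quick & Dirty Guide, for Beginners"));
        // The whole value comes back as a single token:
        // [Quick dirty guide beginners]
        for (Token t = ts.next(); t != null; t = ts.next()) {
            System.out.println("[" + t.termText() + "]");
        }
    }
}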


Here is how I declare the analyser:

  <fieldType name="text_em" class="solr.TextField"
positionIncrementGap="100">
                <analyzer class="net.jseeker.lucene.FacetTextAnalyser"/>
    </fieldType>
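
The "title" field is then mapped to this type along these lines (the
exact indexed/stored flags in my schema aren't important here):

  <field name="title" type="text_em" indexed="true" stored="true"/>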



--
Regards,

Cuong Hoang
