If you are writing a custom TokenStream, I recommend using some of the resources in Lucene's test-framework.jar to test it. These find lots of bugs (including thread-safety bugs)!
For a filter, I recommend using the assertions in BaseTokenStreamTestCase: assertTokenStreamContents, assertAnalyzesTo, and especially checkRandomData:
http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java

When testing your filter, don't use WhitespaceTokenizer or KeywordTokenizer; use MockTokenizer, which has even more checks:
http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java

For some examples, you can look at the tests in modules/analysis. And of course enable assertions (-ea) when testing!
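For instance, a test for the filter in the mail quoted below might look roughly like this. It is only a sketch, not tested as-is: it assumes the trunk Analyzer API (createComponents returning TokenStreamComponents), and it assumes the filter turns "abcdefghijk" into the tokens 1 2 3 4 stacked at a single position, as described below:

import java.io.Reader;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;

public class TestCustomFilter extends BaseTokenStreamTestCase {

  // MockTokenizer instead of WhitespaceTokenizer: it asserts that the
  // consumer workflow (reset/incrementToken/end/close) is correct
  private final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new CustomFilter(tokenizer));
    }
  };

  public void testBasics() throws Exception {
    // checks terms and position increments: the first generated token
    // advances the position, the rest stack at the same position
    assertAnalyzesTo(analyzer, "abcdefghijk",
        new String[] { "1", "2", "3", "4" },
        new int[]    { 1, 0, 0, 0 });
  }

  public void testRandomData() throws Exception {
    // feeds lots of random text through the analyzer; because the same
    // analyzer instance is used over and over, this also exercises the
    // reuse path where state bugs like the one below tend to show up
    checkRandomData(new Random(), analyzer, 10000);
  }
}

checkRandomData is the one that finds the bugs you don't think to write tests for, including the thread-safety and reuse problems mentioned above.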
On Thu, Feb 9, 2012 at 6:30 PM, Jamie Johnson <jej2...@gmail.com> wrote:
> I need to take user input and index it in a unique fashion:
> essentially the value is some string (say "abcdefghijk") that needs
> to be converted into a set of tokens (say 1 2 3 4). I have currently
> implemented a custom TokenFilter to do this; is this appropriate? In
> cases where I am indexing things slowly (i.e. one at a time) this
> works fine, but when I send 10,000 things to Solr (all in one thread)
> I notice exceptions where it seems that the generated instance
> variable is being used by several threads. Is my implementation
> appropriate, or is there another more appropriate way to do this?
> Are TokenFilters reused? Would it be more appropriate to convert the
> stream to one space-separated token and then run that through a
> WhitespaceTokenizer? Any guidance on this would be greatly
> appreciated.
>
> class CustomFilter extends TokenFilter {
>   private final CharTermAttribute termAtt =
>       addAttribute(CharTermAttribute.class);
>   private final PositionIncrementAttribute posAtt =
>       addAttribute(PositionIncrementAttribute.class);
>
>   protected CustomFilter(TokenStream input) {
>     super(input);
>   }
>
>   Iterator<AttributeSource> replacement;
>   private List<AttributeSource> generated;
>
>   @Override
>   public boolean incrementToken() throws IOException {
>     if (generated == null) {
>       // set up generated
>       if (!input.incrementToken()) {
>         return false;
>       }
>
>       //clearAttributes();
>       List<String> cells =
>           StaticClass.generateTokens(termAtt.toString());
>       generated = new ArrayList<AttributeSource>(cells.size());
>       boolean first = true;
>       for (String cell : cells) {
>         AttributeSource newTokenSource = this.cloneAttributes();
>
>         CharTermAttribute newTermAtt =
>             newTokenSource.addAttribute(CharTermAttribute.class);
>         newTermAtt.setEmpty();
>         newTermAtt.append(cell);
>         OffsetAttribute newOffsetAtt =
>             newTokenSource.addAttribute(OffsetAttribute.class);
>         PositionIncrementAttribute newPosIncAtt =
>             newTokenSource.addAttribute(PositionIncrementAttribute.class);
>         newOffsetAtt.setOffset(0, 0);
>         newPosIncAtt.setPositionIncrement(first ? 1 : 0);
>         generated.add(newTokenSource);
>         first = false;
>       }
>     }
>
>     if (!generated.isEmpty()) {
>       copy(this, generated.remove(0));
>       return true;
>     }
>
>     return false;
>   }
>
>   private void copy(AttributeSource target, AttributeSource source) {
>     if (target != source)
>       source.copyTo(target);
>   }
>
>   private LinkedList<AttributeSource> buffer;
>   private LinkedList<AttributeSource> matched;
>
>   private boolean exhausted;
>
>   private AttributeSource nextTok() throws IOException {
>     if (buffer != null && !buffer.isEmpty()) {
>       return buffer.removeFirst();
>     } else {
>       if (!exhausted && input.incrementToken()) {
>         return this;
>       } else {
>         exhausted = true;
>         return null;
>       }
>     }
>   }
>
>   @Override
>   public void reset() throws IOException {
>     super.reset();
>     generated = null;
>   }
> }

--
lucidimagination.com
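P.S. To your question: yes, TokenStreams are reused (the analyzer hands back the same instance per thread), so all per-stream state must live in instance fields and be cleared in reset(). For comparison, here is one way to write that kind of one-to-many filter with captureState/restoreState instead of cloneAttributes. This is an untested sketch: StaticClass.generateTokens is the helper from your mail (it must itself be thread-safe), and it leaves offsets alone rather than setting them to (0,0):

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class CustomFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt =
      addAttribute(PositionIncrementAttribute.class);

  // per-instance queue of generated values still waiting to be emitted
  private final LinkedList<String> pending = new LinkedList<String>();
  // attribute state of the current input token, restored for each
  // stacked token so offsets and other attributes carry over
  private State savedState;

  public CustomFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!pending.isEmpty()) {
      // emit a buffered value, stacked at the same position as the
      // token it was generated from
      restoreState(savedState);
      termAtt.setEmpty().append(pending.removeFirst());
      posIncAtt.setPositionIncrement(0);
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    List<String> cells = StaticClass.generateTokens(termAtt.toString());
    if (cells.isEmpty()) {
      return true; // nothing generated: let the original token through
    }
    // replace the current token's text with the first generated value
    // (keeping its offsets and position increment) and queue the rest
    savedState = captureState();
    termAtt.setEmpty().append(cells.get(0));
    pending.addAll(cells.subList(1, cells.size()));
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    pending.clear();
    savedState = null;
  }
}

With this shape, "abcdefghijk" comes out as 1 2 3 4 at a single position, and nothing is static or shared between streams, so reuse and multithreading are safe as long as generateTokens is.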