Hi,
I have a question about SynonymGraphFilter.
During query parsing I expected a phrase query for multi-word
synonyms, but the query produced is an OR of all the tokens that compose
the multi-word synonym. Is this the correct behavior? I attach a test for this
question. Examples: query: "text analysis is a serious thing" (quoted
query) Query after parsing: BooleanQuery -> spanNear([spanOr([text:nlp,
spanNear([text:text, text:analysis], 0, true)]), text:is, text:a,
text:serious, text:thing], 0, true). In this query, nlp is a synonym of
text analysis. Query: the labrador is an awesome dog. Query after parsing:
(((+text:a +text:big +text:dog) (+text:the +text:labrador))) (text:is)
(text:an) (text:awesome) (((+text:mammal +text:animal) text:dog)) Query
expected: (spanOr([spanNear([text:a, text:big, text:dog], 0, true),
spanNear([text:the, text:labrador], 0, true)])) (text:is) (text:an)
(text:awesome) (spanOr([spanNear([text:mammal, text:animal], 0, true),
text:dog]))
in this query the synonyms are:
the labrador -> a big dog
dog -> mammal animal
Thank you,
Gianpiero Sportelli
--
*CELI srl*
via San Quintino, 31 - Torino
<https://www.google.com/maps/place/Via+S.+Quintino,+31,+10121+Torino+TO/@45.0668691,7.6684529,17z/data=%213m1%214b1%214m5%213m4%211s0x47886d13c6b49f81:0x2b74ae2a12fca9de%218m2%213d45.0668653%214d7.6706416>
Torino IT – 10121
<https://www.google.com/maps/place/Via+S.+Quintino,+31,+10121+Torino+TO/@45.0668691,7.6684529,17z/data=%213m1%214b1%214m5%213m4%211s0x47886d13c6b49f81:0x2b74ae2a12fca9de%218m2%213d45.0668653%214d7.6706416>
*
*
*T *+39 011 5627115
*W *www.celi.it <https://www.celi.it/>
package it.celi.sophia.lucene7.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.CharsRefBuilder;
import org.junit.Test;
import java.io.IOException;
import static org.apache.commons.lang3.StringUtils.EMPTY;
import static org.apache.commons.lang3.StringUtils.repeat;
class Formatter{
public static String formatQuery(final Query q) {
final StringBuilder sb = new StringBuilder();
formatQuery(q, EMPTY, 0, sb);
return sb.toString();
}
private static void formatQuery(final Query q, final String prefix, final int level, final StringBuilder sb) {
final String indent = repeat(' ', level * 3);
sb.append(indent)
.append(prefix)
.append(q.getClass().getSimpleName())
.append(" -> ")
.append(q.toString())
.append(System.lineSeparator());
if (q instanceof BooleanQuery) {
for (final BooleanClause bc : ((BooleanQuery) q).clauses()) {
final String occur = bc.getOccur().equals("+") ? "AND " : bc.getOccur().equals("-") ? "NOT " : "OR ";
formatQuery(bc.getQuery(), occur, level + 1, sb);
}
}
}
}
/**
 * Exploratory test that prints the queries the classic query parser builds when the
 * analyzer chain contains a {@link SynonymGraphFilter} with multi-word synonyms.
 *
 * <p>The test makes no assertions; it parses one quoted and one unquoted query and prints
 * the resulting query trees through {@code Formatter.formatQuery}.
 */
public class SynonymGraphFilterTest {

    /**
     * Parses a quoted and an unquoted query against the "text" field and prints the result.
     *
     * @throws ParseException if the query parser rejects one of the query strings
     */
    @Test
    public void testSynonymAnalyzer() throws ParseException {
        final Analyzer analyzer = createAnalyzer();
        // NOTE(review): Lucene's QueryBuilder offers setAutoGenerateMultiTermSynonymsPhraseQuery(boolean);
        // enabling it on the parser may be what turns multi-term synonyms into phrase queries for
        // unquoted input. Left at the default here so the test keeps showing the default
        // behavior - confirm against the Lucene version in use.
        final QueryParser qp = new MultiFieldQueryParser(new String[]{"text"}, analyzer) {
            @Override
            protected Query newFieldQuery(final Analyzer analyzer, final String field, final String queryText, final boolean quoted)
                    throws ParseException {
                // Trace every chunk of text the parser hands to the analyzer.
                System.out.println("text:: " + queryText + " - quoted:: " + quoted + " analyzer:: " + analyzer);
                return super.newFieldQuery(analyzer, field, queryText, quoted);
            }
        };
        System.out.println("--------------------------------------------");
        String text = "\"text analysis is a serious thing\"";
        System.out.println(text);
        Query q = qp.parse(text);
        System.out.println(Formatter.formatQuery(q));
        text = "the labrador is an awesome dog";
        System.out.println(text);
        q = qp.parse(text);
        System.out.println(Formatter.formatQuery(q));
    }

    /**
     * Creates the analyzer under test: whitespace tokenizer, lower-casing, then synonym
     * graph expansion using the map from {@link #buildSynonymMap()}.
     */
    private Analyzer createAnalyzer() {
        return new Analyzer() {
            @Override
            protected Analyzer.TokenStreamComponents createComponents(final String fieldName) {
                final SynonymMap smap = buildSynonymMap();
                final Tokenizer src = new WhitespaceTokenizer();
                TokenStream ts = new LowerCaseFilter(src);
                // Tokens are already lower-cased by the filter above.
                ts = new SynonymGraphFilter(ts, smap, false);
                return new Analyzer.TokenStreamComponents(src, ts);
            }

            @Override
            protected TokenStream normalize(final String fieldName, final TokenStream in) {
                return new LowerCaseFilter(in);
            }
        };
    }

    /**
     * Builds the synonym map used by the analyzer.
     *
     * @return the built synonym map, never {@code null}
     * @throws IllegalStateException if the map cannot be built; failing here avoids passing a
     *                               {@code null} map on to the synonym filter
     */
    private static SynonymMap buildSynonymMap() {
        final SynonymMap.Builder builder = new SynonymMap.Builder();
        add("text analysis", "nlp", true, builder);
        add("the labrador", "a big dog", true, builder);
        add("dog", "mammal animal", true, builder);
        try {
            return builder.build();
        } catch (IOException e) {
            throw new IllegalStateException("Unable to build the synonym map", e);
        }
    }

    /**
     * Registers one synonym rule on the builder, splitting both sides into words on runs of
     * spaces and joining them with {@code SynonymMap.Builder.join}.
     *
     * @param input    the text to match, words separated by spaces
     * @param output   the replacement text, words separated by spaces
     * @param keepOrig passed through to {@code SynonymMap.Builder.add}
     * @param builder  the builder receiving the rule
     */
    private static void add(String input, String output, boolean keepOrig, SynonymMap.Builder builder) {
        System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
        CharsRefBuilder inputCharsRef = new CharsRefBuilder();
        SynonymMap.Builder.join(input.split(" +"), inputCharsRef);
        CharsRefBuilder outputCharsRef = new CharsRefBuilder();
        SynonymMap.Builder.join(output.split(" +"), outputCharsRef);
        builder.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
    }
}