jpountz commented on a change in pull request #84: URL: https://github.com/apache/lucene/pull/84#discussion_r615920337
########## File path: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java ########## @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.util.Set; +import org.apache.lucene.analysis.util.StemmerUtil; + +/** + * This Normalizer does the heavy lifting for a set of Scandinavian normalization filters, + * normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa, + * ao, ae, oe and oo) by transforming them to åÅæÆøØ. + * + * @since 9.0 Review comment: Should we make it `lucene.internal`? My understanding is that it's really only used as a way to share code between the various Scandinavian normalization filters, and that we would make it package-private if the Scandinavian and the Norwegian filters shared the same package. ########## File path: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizer.java ########## @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.util.Set; +import org.apache.lucene.analysis.util.StemmerUtil; + +/** + * This Normalizer does the heavy lifting for a set of Scandinavian normalization filters, + * normalizing use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded variants (aa, + * ao, ae, oe and oo) by transforming them to åÅæÆøØ. + * + * @since 9.0 + */ +public final class ScandinavianNormalizer { + + /** + * Create the instance, while choosing which foldings to apply. This may differ between Norwegian, + * Danish and Swedish. + * + * @param foldings a Set of Foldings to apply (i.e. AE, OE, AA, AO, OO) + */ + public ScandinavianNormalizer(Set<Foldings> foldings) { + this.foldings = foldings; + } + + /** List of possible foldings that can be used when configuring the filter */ + public enum Foldings { + AA, + AO, + AE, + OE, + OO + } + + private final Set<Foldings> foldings; + + public static final Set<Foldings> ALL_FOLDINGS = + Set.of(Foldings.AA, Foldings.AO, Foldings.OO, Foldings.AE, Foldings.OE); Review comment: Use `EnumSet#allOf`? 
########## File path: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java ########## @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.no; + +import java.util.Set; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings; + +/** + * This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded + * variants (ae, oe, aa) by transforming them to åÅæÆøØ. This is similar to + * ScandinavianNormalizationFilter, except for the folding rules customized for Norwegian. + * + * <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej + * + * @see ScandinavianNormalizationFilter + */ +public final class NorwegianNormalizationFilter extends ScandinavianNormalizationFilter { Review comment: Let's favor composition over inheritance and reuse the ScandinavianNormalizer instead of extending ScandinavianNormalizationFilter? 
########## File path: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestScandinavianNormalizer.java ########## @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; +import java.util.Collections; +import java.util.Set; +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.junit.Test; + +/** Tests low level the normalizer functionality */ +public class TestScandinavianNormalizer extends BaseTokenStreamTestCase { + @Test Review comment: nit: most test suites we have don't use `@Test` and rely on naming conventions instead, let's not grow the number of `@Test` annotations with this PR? ########## File path: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianNormalizationFilter.java ########## @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.no; + +import java.util.Set; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizer.Foldings; + +/** + * This filter normalize use of the interchangeable Scandinavian characters æÆäÄöÖøØ and folded + * variants (ae, oe, aa) by transforming them to åÅæÆøØ. This is similar to + * ScandinavianNormalizationFilter, except for the folding rules customized for Norwegian. + * + * <p>blåbærsyltetøj == blåbärsyltetöj == blaabaersyltetoej + * + * @see ScandinavianNormalizationFilter + */ +public final class NorwegianNormalizationFilter extends ScandinavianNormalizationFilter { + public NorwegianNormalizationFilter(TokenStream input) { + super(input, Set.of(Foldings.AE, Foldings.OE, Foldings.AA)); Review comment: Use `EnumSet#of` instead of `Set#of` to make this set internally represented as a BitSet? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org