ACCUMULO-374 Removed stop list stuff

git-svn-id: https://svn.apache.org/repos/asf/incubator/accumulo/branches/1.4@1241141 13f79535-47bb-0310-9956-ffa450edef68
Project: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/commit/6a3b4190
Tree: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/tree/6a3b4190
Diff: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/diff/6a3b4190

Branch: refs/heads/1.4.5-SNAPSHOT
Commit: 6a3b41909815ba1e44ce6ebf6de24ba756f1ccde
Parents: b9cf294
Author: Adam Fuchs <afu...@apache.org>
Authored: Mon Feb 6 20:07:59 2012 +0000
Committer: Adam Fuchs <afu...@apache.org>
Committed: Mon Feb 6 20:07:59 2012 +0000

----------------------------------------------------------------------
 .../wikisearch/ingest/WikipediaMapper.java | 35 +-------------------
 1 file changed, 1 insertion(+), 34 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/blob/6a3b4190/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
index b25c042..c343f52 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
@@ -32,7 +32,6 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
 import org.apache.accumulo.core.data.Mutation;
 import org.apache.accumulo.core.data.Value;
 import org.apache.accumulo.core.security.ColumnVisibility;
@@ -48,20 +47,9 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.StopAnalyzer;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.ar.ArabicAnalyzer;
-import org.apache.lucene.analysis.br.BrazilianAnalyzer;
-import org.apache.lucene.analysis.cjk.CJKAnalyzer;
-import org.apache.lucene.analysis.de.GermanAnalyzer;
-import org.apache.lucene.analysis.el.GreekAnalyzer;
-import org.apache.lucene.analysis.fa.PersianAnalyzer;
-import org.apache.lucene.analysis.fr.FrenchAnalyzer;
-import org.apache.lucene.analysis.nl.DutchAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
-
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
@@ -82,7 +70,6 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
   private ArticleExtractor extractor;
   private String language;
   private int numPartitions = 0;
-  private Set<?> stopwords = null;
   private ColumnVisibility cv = null;
   private Text tablename = null;
@@ -103,25 +90,6 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
     Matcher matcher = languagePattern.matcher(fileName);
     if (matcher.matches()) {
       language = matcher.group(1).replace('_', '-').toLowerCase();
-      if (language.equals("arwiki"))
-        stopwords = ArabicAnalyzer.getDefaultStopSet();
-      else if (language.equals("brwiki"))
-        stopwords = BrazilianAnalyzer.getDefaultStopSet();
-      else if (language.startsWith("zh"))
-        stopwords = CJKAnalyzer.getDefaultStopSet();
-      else if (language.equals("dewiki"))
-        stopwords = GermanAnalyzer.getDefaultStopSet();
-      else if (language.equals("elwiki"))
-        stopwords = GreekAnalyzer.getDefaultStopSet();
-      else if (language.equals("fawiki"))
-        stopwords = PersianAnalyzer.getDefaultStopSet();
-      else if (language.equals("frwiki"))
-        stopwords = FrenchAnalyzer.getDefaultStopSet();
-      else if (language.equals("nlwiki"))
-        stopwords = DutchAnalyzer.getDefaultStopSet();
-      else
-        stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
-
     } else {
       throw new RuntimeException("Unknown ingest language! " + fileName);
     }
@@ -230,9 +198,8 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
     Set<String> tokenList = new HashSet<String>();
     WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
     TermAttribute term = tok.addAttribute(TermAttribute.class);
-    StopFilter filter = new StopFilter(false, tok, stopwords, true);
     try {
-      while (filter.incrementToken()) {
+      while (tok.incrementToken()) {
         String token = term.term();
         if (!StringUtils.isEmpty(token))
           tokenList.add(token);