[
https://issues.apache.org/jira/browse/OPENNLP-1594?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17866133#comment-17866133
]
ASF GitHub Bot commented on OPENNLP-1594:
-----------------------------------------
rzo1 commented on code in PR #158:
URL: https://github.com/apache/opennlp-sandbox/pull/158#discussion_r1678162623
##########
summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java:
##########
@@ -97,13 +125,17 @@ public double getWeightedSimilarity(String sent1, String
sent2,
return (wordsInCommon) / (words1.length + words2.length);
}
- // Gets the current score from the list of scores passed ...
+ /**
+ * @param scores A list of {@link Score} instances.
+ * @param id The sentence id to check for.
+ * @return Gets the element from {@code scores} that matches the passed
sentence {@code id}.
+ */
public double getScoreFrom(List<Score> scores, int id) {
for (Score s : scores) {
if (s.getSentId() == id)
return s.getScore();
}
- return 1;
+ return 1; // Why is the default score "1" here?
Review Comment:
:)
##########
summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java:
##########
@@ -44,95 +48,128 @@ public class LexicalChainingSummarizer implements
Summarizer {
private final DocProcessor docProcessor;
private final WordRelationshipDetermination wordRel;
- public LexicalChainingSummarizer(DocProcessor dp, OpenNLPPOSTagger
posTagger) {
- docProcessor = dp;
- tagger = posTagger;
- wordRel = new WordRelationshipDetermination();
+ /**
+ * Instantiates a {@link LexicalChainingSummarizer}.
+ *
+ * @param docProcessor The {@link DocProcessor} to use at runtime. Must not
be {@code null}.
+ * @param languageCode An ISO-language code for obtaining a {@link POSModel}.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public LexicalChainingSummarizer(DocProcessor docProcessor, String
languageCode) throws IOException {
+ this(docProcessor, new NounPOSTagger(languageCode));
}
- public LexicalChainingSummarizer(DocProcessor dp, InputStream posModelFile)
throws Exception {
- this(dp, new OpenNLPPOSTagger(dp, posModelFile));
+ /**
+ * Instantiates a {@link LexicalChainingSummarizer}.
+ *
+ * @param docProcessor The {@link DocProcessor} to use at runtime. Must not
be {@code null}.
+ * @param posTagger The {@link NounPOSTagger} to use at runtime. Must not be
{@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public LexicalChainingSummarizer(DocProcessor docProcessor, NounPOSTagger
posTagger) {
+ if (docProcessor == null) throw new IllegalArgumentException("Parameter
'docProcessor' must not be null!");
+ if (posTagger == null) throw new IllegalArgumentException("Parameter
'posTagger' must not be null!");
+
+ this.docProcessor = docProcessor;
+ tagger = posTagger;
+ wordRel = new WordRelationshipDetermination();
}
- //Build Lexical chains..
- public List<LexicalChain> buildLexicalChains(String article, List<Sentence>
sent) {
- // POS tag article
- Hashtable<String, List<LexicalChain>> chains = new Hashtable<>();
- List<LexicalChain> lc = new ArrayList<>();
- // Build lexical chains
- // For each sentence
- for (Sentence currSent : sent) {
- String taggedSent = tagger.getTaggedString(currSent.getStringVal());
- List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN);
- // For each noun
- for (String noun : nouns) {
- int chainsAddCnt = 0;
- // Loop through each LC
- for (LexicalChain l : lc) {
- try {
- WordRelation rel = wordRel.getRelation(l, noun,
(currSent.getSentId() - l.start) > 7);
- // Is the noun an exact match to one of the current LCs (Strong
relation)
- // Add sentence to chain
- if (rel.relation() == WordRelation.STRONG_RELATION) {
- addToChain(rel.dest(), l, chains, currSent);
- if (currSent.getSentId() - l.last > 10) {
- l.occurrences++;
- l.start = currSent.getSentId();
- }
- chainsAddCnt++;
- } else if (rel.relation() == WordRelation.MED_RELATION) {
- // Add sentence to chain if it is 7 sentences away from start
of chain
- addToChain(rel.dest(), l, chains, currSent);
- chainsAddCnt++;
- //If greater than 7 we will add it but call it a new occurrence
of the lexical chain...
- if (currSent.getSentId() - l.start > 7) {
- l.occurrences++;
- l.start = currSent.getSentId();
- }
- } else if (rel.relation() == WordRelation.WEAK_RELATION) {
- if (currSent.getSentId() - l.start <= 3) {
+ /**
+ * Constructs a list of {@link LexicalChain lexical chains} from specified
sentences.
+ *
+ * @param sentences The list of {@link Sentence sentences} to build lexical
chains from.
+ * Must not be {@code null}.
+ * @return The result list of {@link LexicalChain lexical chains}.
Guaranteed to be not {@code null}.
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public List<LexicalChain> buildLexicalChains(List<Sentence> sentences) {
+ if (sentences == null) throw new IllegalArgumentException("Parameter
'sentences' must not be null!");
+ else {
+ if (sentences.isEmpty()) {
+ return Collections.emptyList();
+ }
+ Hashtable<String, List<LexicalChain>> chains = new Hashtable<>();
+ List<LexicalChain> lc = new ArrayList<>();
+ // Build lexical chains
+ // For each sentence
+ for (Sentence currSent : sentences) {
+ // POS tag article
+ String taggedSent =
tagger.getTaggedString(currSent.getStringVal().replace(".", " ."));
+ List<String> nouns =
tagger.getWordsOfType(docProcessor.getWords(taggedSent), POSTagger.NOUN);
+ // For each noun
+ for (String noun : nouns) {
+ int chainsAddCnt = 0;
+ // Loop through each LC
+ for (LexicalChain l : lc) {
+ try {
+ WordRelation rel = wordRel.getRelation(l, noun,
(currSent.getSentId() - l.start) > 7);
Review Comment:
Maybe we can use speaking constants for `3`, `7` and `10` here because it
seems, that they decode the number of sentences from the start of the chain?
> Add stricter tests for Summarizer component
> -------------------------------------------
>
> Key: OPENNLP-1594
> URL: https://issues.apache.org/jira/browse/OPENNLP-1594
> Project: OpenNLP
> Issue Type: Test
> Affects Versions: 2.3.3
> Reporter: Martin Wiesner
> Assignee: Martin Wiesner
> Priority: Major
> Fix For: 2.4.0
>
>
> While OPENNLP-1592 and OPENNLP-1593 raised test coverage for the summarizer
> component, there are still some "dark spots" and probably inaccuracies in the
> implementation of that component. More, stricter tests shall shed light on
> the situation.
> Aims:
> * add further, stricter tests for the summarizer component
> * clarify, at API level, the semantics and constraints of parameters
> * better separate tests so that each test class has a clear responsibility
> for its class under test
> * improve / enhance the JavaDoc further
--
This message was sent by Atlassian Jira
(v8.20.10#820010)