[
https://issues.apache.org/jira/browse/OPENNLP-1416?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17649612#comment-17649612
]
ASF GitHub Bot commented on OPENNLP-1416:
-----------------------------------------
rzo1 commented on code in PR #461:
URL: https://github.com/apache/opennlp/pull/461#discussion_r1052995489
##########
opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java:
##########
@@ -154,61 +153,49 @@ public class ADNameSampleStream implements
ObjectStream<NameSample> {
private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
- /**
+ /*
* To keep the last left contraction part
*/
private String leftContractionPart = null;
private final boolean splitHyphenatedTokens;
/**
- * Creates a new {@link NameSample} stream from a line stream, i.e.
- * {@link ObjectStream}<{@link String}>, that could be a
- * {@link PlainTextByLineStream} object.
+ * Initializes a new {@link ADNameSampleStream} stream from a {@link
ObjectStream<String>},
+ * that could be a {@link PlainTextByLineStream} object.
*
- * @param lineStream
- * a stream of lines as {@link String}
- * @param splitHyphenatedTokens
- * if true hyphenated tokens will be separated: "carros-monstro"
>
- * "carros" "-" "monstro"
+ * @param lineStream An {@link ObjectStream<String>} as input.
+ * @param splitHyphenatedTokens If {@code true} hyphenated tokens will be
separated:
+ * "carros-monstro" > "carros" "-" "monstro".
*/
public ADNameSampleStream(ObjectStream<String> lineStream, boolean
splitHyphenatedTokens) {
this.adSentenceStream = new ADSentenceStream(lineStream);
this.splitHyphenatedTokens = splitHyphenatedTokens;
}
/**
- * Creates a new {@link NameSample} stream from a {@link InputStream}
+ * Initializes a new {@link ADNameSampleStream} from an {@link
InputStreamFactory}
*
- * @param in
- * the Corpus {@link InputStream}
- * @param charsetName
- * the charset of the Arvores Deitadas Corpus
- * @param splitHyphenatedTokens
- * if true hyphenated tokens will be separated: "carros-monstro"
>
- * "carros" "-" "monstro"
+ * @param in The Corpus {@link InputStreamFactory}.
+ * @param charsetName The {@link java.nio.charset.Charset charset} to use
+ * for reading of the corpus.
+ * @param splitHyphenatedTokens If {@code true} hyphenated tokens will be
separated:
+ * "carros-monstro" > "carros" "-" "monstro".
*/
@Deprecated
public ADNameSampleStream(InputStreamFactory in, String charsetName,
boolean splitHyphenatedTokens) throws IOException {
-
- try {
- this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
- in, charsetName));
- this.splitHyphenatedTokens = splitHyphenatedTokens;
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
+ this(new PlainTextByLineStream(in, charsetName), splitHyphenatedTokens);
}
private int textID = -1;
+ @Override
public NameSample read() throws IOException {
Sentence paragraph;
// we should look for text here.
- while ((paragraph = this.adSentenceStream.read()) != null) {
+ if ((paragraph = this.adSentenceStream.read()) != null) {
Review Comment:
That looks like the `while` was a bug previously (emptying the whole
underlying stream...) - good catch!
##########
opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java:
##########
@@ -64,35 +59,26 @@ public ADPOSSampleStream(ObjectStream<String> lineStream,
boolean expandME,
}
/**
- * Creates a new {@link POSSample} stream from a {@link InputStream}
+ * Creates a new {@link POSSample} stream from an {@link InputStreamFactory}
*
- * @param in
- * the Corpus {@link InputStream}
- * @param charsetName
- * the charset of the Arvores Deitadas Corpus
- * @param expandME
- * if true will expand the multiword expressions, each word of the
+ * @param in The {@link InputStreamFactory} for the corpus.
+ * @param charsetName The {@link java.nio.charset.Charset charset} to use
+ * for reading of the corpus.
+ * @param expandME If {@code true} will expand the multiword expressions,
each word of the
* expression will have the POS Tag that was attributed to the
- * expression plus the prefix B- or I- (CONLL convention)
- * @param includeFeatures
- * if true will combine the POS Tag with the feature tags
+ * expression plus the prefix {@code B-} or {@code I-} (CONLL
convention).
+ * @param includeFeatures If {@code true} will combine the POS Tag with the
feature tags.
*/
public ADPOSSampleStream(InputStreamFactory in, String charsetName,
boolean expandME, boolean includeFeatures) throws IOException {
- try {
- this.adSentenceStream = new ADSentenceStream(new
PlainTextByLineStream(in, charsetName));
- this.expandME = expandME;
- this.isIncludeFeatures = includeFeatures;
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
+ this(new PlainTextByLineStream(in, charsetName), expandME,
includeFeatures);
}
+ @Override
public POSSample read() throws IOException {
Sentence paragraph;
- while ((paragraph = this.adSentenceStream.read()) != null) {
+ if ((paragraph = this.adSentenceStream.read()) != null) {
Review Comment:
:-)
> Enhance JavaDoc in opennlp.tools.formats.ad package
> ---------------------------------------------------
>
> Key: OPENNLP-1416
> URL: https://issues.apache.org/jira/browse/OPENNLP-1416
> Project: OpenNLP
> Issue Type: Improvement
> Components: Formats
> Affects Versions: 2.1.0
> Reporter: Martin Wiesner
> Assignee: Martin Wiesner
> Priority: Minor
> Fix For: 2.1.1
>
>
> The JavaDoc of the _opennlp.tools.formats.ad_ package suffers from several
> inconsistencies and missing descriptions. Moreover, several typos are present
> that need sanitizing.
> It needs enhancements and/or additions to provide more clarity for readers.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)