Repository: commons-text Updated Branches: refs/heads/pr/4 [created] 9e84145b1
salutations: matches salutations. Miss, Dr, .. Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/1c640335 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/1c640335 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/1c640335 Branch: refs/heads/pr/4 Commit: 1c6403353bad890d0338d7f2d50274c7d79e4a3b Parents: a0178d0 Author: Tom MacKenzie <tom.s.macken...@gmail.com> Authored: Mon Jun 20 11:13:42 2016 -0500 Committer: Tom MacKenzie <tom.s.macken...@gmail.com> Committed: Mon Jun 20 11:13:42 2016 -0500 ---------------------------------------------------------------------- .gitignore | 1 + .../commons/text/names/HumanNameParser.java | 11 +++- .../org/apache/commons/text/names/Name.java | 9 ++- .../commons/text/names/HumanNameParserTest.java | 6 +- .../org/apache/commons/text/names/testNames.txt | 65 ++++++++++---------- 5 files changed, 57 insertions(+), 35 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index c8130e7..7eaf4b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Include only files generated during build, and avoid IDE specific files target/ site-content +*.iml http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/main/java/org/apache/commons/text/names/HumanNameParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/names/HumanNameParser.java b/src/main/java/org/apache/commons/text/names/HumanNameParser.java index d713e9f..6780194 100644 --- a/src/main/java/org/apache/commons/text/names/HumanNameParser.java +++ b/src/main/java/org/apache/commons/text/names/HumanNameParser.java @@ -100,6 +100,7 @@ import 
org.apache.commons.lang3.StringUtils; */ public final class HumanNameParser { + private final List<String> salutations; private final List<String> suffixes; private final List<String> prefixes; @@ -108,6 +109,10 @@ public final class HumanNameParser { */ public HumanNameParser() { // TODO make this configurable + this.salutations = Arrays.asList( + "Mr", "Mrs", "Ms", "Miss", "Dr", + "Mr.", "Mrs.", "Ms.", "Miss.", "Dr." + ); this.suffixes = Arrays.asList( "esq", "esquire", "jr", "sr", "2", "ii", "iii", "iv"); @@ -131,6 +136,7 @@ public final class HumanNameParser { NameString nameString = new NameString(name); // TODO compile regexes only once when the parser is created + String salutations = StringUtils.join(this.salutations, " |") + ""; String suffixes = StringUtils.join(this.suffixes, "\\.*|") + "\\.*"; String prefixes = StringUtils.join(this.prefixes, " |") + " "; @@ -138,6 +144,7 @@ public final class HumanNameParser { // but you can select a particular parenthesized submatch to be returned. // Also, note that each regex requires that the preceding ones have been run, and matches chopped out. 
// names that start or end with an apostrophe break this + String salutationRegex = "^(("+salutations+"))"; String nicknamesRegex = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) "; String suffixRegex = "(?i),* *((" + suffixes + ")$)"; String lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$"; @@ -145,6 +152,8 @@ public final class HumanNameParser { String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))"; String firstRegex = "(?i)^([^ ]+)"; + String salutation = nameString.chopWithRegex(salutationRegex, 1); + // get nickname, if there is one String nickname = nameString.chopWithRegex(nicknamesRegex, 2); @@ -169,7 +178,7 @@ public final class HumanNameParser { // if anything's left, that's the middle name String middle = nameString.getWrappedString(); - return new Name(leadingInit, first, nickname, middle, last, suffix); + return new Name(leadingInit, salutation, first, nickname, middle, last, suffix); } } http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/main/java/org/apache/commons/text/names/Name.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/names/Name.java b/src/main/java/org/apache/commons/text/names/Name.java index 7e32de4..ef3d36a 100644 --- a/src/main/java/org/apache/commons/text/names/Name.java +++ b/src/main/java/org/apache/commons/text/names/Name.java @@ -26,14 +26,16 @@ import java.util.Objects; public final class Name { private final String leadingInitial; + private final String salutation; private final String firstName; private final String nickName; private final String middleName; private final String lastName; private final String suffix; - Name(String leadingInitial, String firstName, String nickName, String middleName, String lastName, String suffix) { + Name(String leadingInitial, String salutation, String firstName, String nickName, String middleName, String lastName, String suffix) { this.leadingInitial = leadingInitial; + 
this.salutation = salutation; this.firstName = firstName; this.nickName = nickName; this.middleName = middleName; @@ -52,6 +54,11 @@ public final class Name { return leadingInitial; } + + public String getSalutation() { + return salutation; + } + /** * Gets the first name. * http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java b/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java index f6c9ba6..22c96cc 100644 --- a/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java +++ b/src/test/java/org/apache/commons/text/names/HumanNameParserTest.java @@ -74,6 +74,10 @@ public class HumanNameParserTest { Name result = nameParser.parse(record.get(Columns.Name)); long recordNum = record.getRecordNumber(); + + assertThat("Wrong LeadingInit in record " + recordNum, + result.getSalutation(), equalTo(record.get(Columns.Salutation))); + assertThat("Wrong LeadingInit in record " + recordNum, result.getLeadingInitial(), equalTo(record.get(Columns.LeadingInit))); @@ -94,6 +98,6 @@ public class HumanNameParserTest { } private enum Columns { - Name,LeadingInit,FirstName,NickName,MiddleName,LastName,Suffix + Name,Salutation,LeadingInit,FirstName,NickName,MiddleName,LastName,Suffix } } http://git-wip-us.apache.org/repos/asf/commons-text/blob/1c640335/src/test/resources/org/apache/commons/text/names/testNames.txt ---------------------------------------------------------------------- diff --git a/src/test/resources/org/apache/commons/text/names/testNames.txt b/src/test/resources/org/apache/commons/text/names/testNames.txt index 8e32bf1..2cd2b4d 100644 --- a/src/test/resources/org/apache/commons/text/names/testNames.txt +++ b/src/test/resources/org/apache/commons/text/names/testNames.txt @@ -1,32 +1,33 @@ 
-Name|LeadingInit|FirstName|NickName|MiddleName|LastName|Suffix -Björn O'Malley||Björn|||O'Malley| -Bin Lin||Bin|||Lin| -Linda Jones||Linda|||Jones| -Jason H. Priem||Jason||H.|Priem| -Björn O'Malley-Muñoz||Björn|||O'Malley-Muñoz| -Björn C. O'Malley||Björn||C.|O'Malley| -Björn "Bill" O'Malley||Björn|Bill||O'Malley| -Björn ("Bill") O'Malley||Björn|Bill||O'Malley| -Björn ("Wild Bill") O'Malley||Björn|Wild Bill||O'Malley| -Björn (Bill) O'Malley||Björn|Bill||O'Malley| -Björn 'Bill' O'Malley||Björn|Bill||O'Malley| -Björn C O'Malley||Björn||C|O'Malley| -Björn C. R. O'Malley||Björn||C. R.|O'Malley| -Björn Charles O'Malley||Björn||Charles|O'Malley| -Björn Charles R. O'Malley||Björn||Charles R.|O'Malley| -Björn van O'Malley||Björn|||van O'Malley| -Björn Charles van der O'Malley||Björn||Charles|van der O'Malley| -Björn Charles O'Malley y Muñoz||Björn||Charles|O'Malley y Muñoz| -Björn O'Malley, Jr.||Björn|||O'Malley|Jr. -Björn O'Malley Jr||Björn|||O'Malley|Jr -B O'Malley||B|||O'Malley| -William Carlos Williams||William||Carlos|Williams| -C. Björn Roger O'Malley|C.|Björn||Roger|O'Malley| -B. C. O'Malley||B.||C.|O'Malley| -B C O'Malley||B||C|O'Malley| -B.J. Thomas||B.J.|||Thomas| -O'Malley, Björn||Björn|||O'Malley| -O'Malley, Björn Jr||Björn|||O'Malley|Jr -O'Malley, C. Björn|C.|Björn|||O'Malley| -O'Malley, C. Björn III|C.|Björn|||O'Malley|III -O'Malley y Muñoz, C. Björn Roger III|C.|Björn||Roger|O'Malley y Muñoz|III \ No newline at end of file +Name|Salutation|LeadingInit|FirstName|NickName|MiddleName|LastName|Suffix +Björn O'Malley|||Björn|||O'Malley| +Bin Lin|||Bin|||Lin| +Linda Jones|||Linda|||Jones| +Jason H. Priem|||Jason||H.|Priem| +Björn O'Malley-Muñoz|||Björn|||O'Malley-Muñoz| +Björn C. 
O'Malley|||Björn||C.|O'Malley| +Björn "Bill" O'Malley|||Björn|Bill||O'Malley| +Björn ("Bill") O'Malley|||Björn|Bill||O'Malley| +Björn ("Wild Bill") O'Malley|||Björn|Wild Bill||O'Malley| +Björn (Bill) O'Malley|||Björn|Bill||O'Malley| +Björn 'Bill' O'Malley|||Björn|Bill||O'Malley| +Björn C O'Malley|||Björn||C|O'Malley| +Björn C. R. O'Malley|||Björn||C. R.|O'Malley| +Björn Charles O'Malley|||Björn||Charles|O'Malley| +Björn Charles R. O'Malley|||Björn||Charles R.|O'Malley| +Björn van O'Malley|||Björn|||van O'Malley| +Björn Charles van der O'Malley|||Björn||Charles|van der O'Malley| +Björn Charles O'Malley y Muñoz|||Björn||Charles|O'Malley y Muñoz| +Björn O'Malley, Jr.|||Björn|||O'Malley|Jr. +Björn O'Malley Jr|||Björn|||O'Malley|Jr +B O'Malley|||B|||O'Malley| +William Carlos Williams|||William||Carlos|Williams| +C. Björn Roger O'Malley||C.|Björn||Roger|O'Malley| +B. C. O'Malley|||B.||C.|O'Malley| +B C O'Malley|||B||C|O'Malley| +B.J. Thomas|||B.J.|||Thomas| +O'Malley, Björn|||Björn|||O'Malley| +O'Malley, Björn Jr|||Björn|||O'Malley|Jr +O'Malley, C. Björn||C.|Björn|||O'Malley| +O'Malley, C. Björn III||C.|Björn|||O'Malley|III +O'Malley y Muñoz, C. Björn Roger III||C.|Björn||Roger|O'Malley y Muñoz|III +Dr. Gaius Baltar|Dr.||Gaius|||Baltar| \ No newline at end of file