That's it! I'm posting my solution too, then :-).
*** Quick and dirty solution in BeanShell *** Here are the contents of uniq.bsh:
// uniq.bsh: prints "'word' count" for every word of the file named by
// bsh.args[0] that also appears in /usr/share/dict/words.
// Matching is case-insensitive: both sides are lower-cased.

// Load the dictionary; each line of the file is treated as one word.
dictWords = new HashSet();
reader = new BufferedReader(new FileReader("/usr/share/dict/words"));
try {
    while ((line = reader.readLine()) != null) {
        dictWords.add(line.toLowerCase());
    }
} finally {
    // Release the file handle even if reading fails.
    reader.close();
}

// Count occurrences of dictionary words in the input file.
wordCountMap = new HashMap();
reader = new BufferedReader(new FileReader(bsh.args[0]));
try {
    while ((line = reader.readLine()) != null) {
        lineWords = line.split("\\s+");
        for (word : lineWords) {
            word = word.toLowerCase();
            // Only touch the count map for words that are actually in the dictionary.
            if (dictWords.contains(word)) {
                wordCount = wordCountMap.get(word);
                wordCountMap.put(word, (wordCount == null) ? 1 : wordCount + 1);
            }
        }
    }
} finally {
    reader.close();
}

// One line per counted word, in the map's iteration order.
for (entry : wordCountMap.entrySet()) {
    print("'" + entry.getKey() + "' " + entry.getValue());
}
*** High performance solution in Java *** Contents of Uniq.java:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
 * Counts how often each dictionary word occurs in a text file.
 *
 * <p>Reads the word list at {@code /usr/share/dict/words}, scans the file named by the first
 * command-line argument, and prints one {@code 'word' count} line to standard output for every
 * dictionary word found. Matching is case-insensitive: dictionary entries and input tokens are
 * both lower-cased before comparison.
 */
public class Uniq {

    /** Path of the dictionary file; each of its lines is treated as one word. */
    private static final String DICTIONARY_PATH = "/usr/share/dict/words";

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the path of the text file to scan; when it is missing a
     *     usage message is written to standard error and nothing else happens
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java Uniq <file>");
            return;
        }
        try {
            Set<String> dictWords = loadDictionary(DICTIONARY_PATH);
            Map<String, Integer> wordCountMap = countWords(args[0], dictWords);
            for (Map.Entry<String, Integer> entry : wordCountMap.entrySet()) {
                System.out.println("'" + entry.getKey() + "' " + entry.getValue());
            }
        } catch (IOException e) {
            // Report unreadable files without a full stack trace; e.toString() carries the path.
            System.err.println("Uniq: cannot read input: " + e);
        }
    }

    /**
     * Reads every line of {@code path} into a set, lower-cased.
     *
     * @param path file to read
     * @return the lower-cased lines of the file
     * @throws IOException if the file cannot be opened or read
     */
    private static Set<String> loadDictionary(String path) throws IOException {
        Set<String> words = new HashSet<String>();
        BufferedReader reader = new BufferedReader(new FileReader(path));
        // try/finally (not try-with-resources) keeps to the language level the file already uses.
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps the case mapping independent of the machine's default locale.
                words.add(line.toLowerCase(Locale.ROOT));
            }
        } finally {
            reader.close();
        }
        return words;
    }

    /**
     * Counts the whitespace-separated tokens of {@code path} that are present in
     * {@code dictWords}.
     *
     * @param path file to scan
     * @param dictWords lower-cased words that are eligible for counting
     * @return occurrence count per lower-cased word; words never seen are absent
     * @throws IOException if the file cannot be opened or read
     */
    private static Map<String, Integer> countWords(String path, Set<String> dictWords)
            throws IOException {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                for (String token : line.split("\\s+")) {
                    String word = token.toLowerCase(Locale.ROOT);
                    // split() yields an empty token for blank lines and leading whitespace;
                    // never count it, even if the dictionary happens to contain a blank line.
                    if (word.length() == 0 || !dictWords.contains(word)) {
                        continue;
                    }
                    Integer seen = counts.get(word);
                    counts.put(word, seen == null ? 1 : seen + 1);
                }
            }
        } finally {
            reader.close();
        }
        return counts;
    }
}
/*
PLUG: http://plug.org, #utah on irc.freenode.net
Unsubscribe: http://plug.org/mailman/options/plug
Don't fear the penguin.
*/