Also from the book, here's an alternative update request processor that uses
a JavaScript script to do the counting and field
creation:
<!-- Update chain that delegates word counting to the add-word-count.js script. -->
<updateRequestProcessorChain name="script-add-word-count">
  <processor class="solr.StatelessScriptUpdateProcessorFactory">
    <!-- Script file to run for each update command. -->
    <str name="script">add-word-count.js</str>
    <!-- Values the script reads through its "params" object. -->
    <lst name="params">
      <!-- Field whose words are counted. -->
      <str name="fieldName">content</str>
      <!-- Field that receives the resulting count. -->
      <str name="wordCountFieldName">content_wc_i</str>
    </lst>
  </processor>
  <!-- Tail of the chain: log the update, then run it. -->
  <processor class="solr.LogUpdateProcessorFactory" />
  <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
Here is the JavaScript script that should be placed in the
"add-word-count.js" file in the "conf" directory for
the Solr collection:
/**
 * Script update-processor hook for add commands: counts the words in one
 * field of the document being added and stores the count in another field.
 *
 * The field names are read from the optional global "params" object
 * (keys "fieldName" and "wordCountFieldName") and default to "content"
 * and "content_wc_i". Hyphens and underscores join word parts, so
 * "CD-ROM" counts as one word. A document that does not have the source
 * field gets a word count of 0 instead of failing the update.
 *
 * @param cmd the add command; must provide getSolrInputDocument()
 */
function processAdd(cmd) {
  var fieldName;
  var wordCountFieldName;
  // "params" is only defined when the chain configuration supplies parameters.
  if (typeof params !== "undefined") {
    fieldName = params.get("fieldName");
    wordCountFieldName = params.get("wordCountFieldName");
  }
  if (fieldName == null)
    fieldName = "content";
  if (wordCountFieldName == null)
    wordCountFieldName = "content_wc_i";

  var doc = cmd.getSolrInputDocument();

  // Get value(s) for named field; the field (or its values) may be absent.
  var field = doc.getField(fieldName);
  var values = (field == null) ? null : field.getValues();

  // Combine values into one string
  var str = "";
  if (values != null) {
    var n = values.size();
    // "var i" keeps the loop index local instead of leaking a global.
    for (var i = 0; i < n; i++)
      str += ' ' + values.get(i);
  }

  // Compress out hyphens and underscores to join words
  var str_no_dash = str.replace(/-|_/g, '');
  // Replace words with simply "X"
  var str_x_words = str_no_dash.replace(/\w+/g, 'X');
  // Remove punctuation and white space, leaving just the "X"es.
  var str_final = str_x_words.replace(/[^X]+/g, '');
  // A count of the "X"es is a good proxy for the word count.
  var wordCount = str_final.length;

  // Set the word count output field value
  doc.addField(wordCountFieldName, wordCount);
}
/**
 * No-op hook for delete commands.
 * @param cmd the delete command (unused; declared for parity with processAdd)
 */
function processDelete(cmd) {
  // Dummy - add if needed
}

/**
 * No-op hook for commit commands.
 * @param cmd the commit command (unused; declared for parity with processAdd)
 */
function processCommit(cmd) {
  // Dummy - add if needed
}

/**
 * No-op hook for rollback commands.
 * @param cmd the rollback command (unused; declared for parity with processAdd)
 */
function processRollback(cmd) {
  // Dummy - add if needed
}

/**
 * No-op hook for merge-indexes commands.
 * @param cmd the merge-indexes command (unused; declared for parity with processAdd)
 */
function processMergeIndexes(cmd) {
  // Dummy - add if needed
}

/** No-op hook; takes no arguments. */
function finish() {
  // Dummy - add if needed
}
A test:
# Post eight sample documents as JSON through the "script-add-word-count" chain
# and commit. The URL is quoted so the shell does not interpret "&".
curl "http://localhost:8983/solr/update?commit=true&update.chain=script-add-word-count" \
  -H 'Content-type:application/json' -d '
[{"id": "doc-1", "content": "Hello World"},
{"id": "doc-2", "content": ""},
{"id": "doc-3", "content": " -- --- !"},
{"id": "doc-4", "content": "This is some more."},
{"id": "doc-5", "content": "The CD-ROM, (and num_events_seen.)"},
{"id": "doc-6", "content": "Four score and seven years ago our fathers
brought forth on this continent a new nation, conceived in liberty,
and dedicated to the proposition that all men are created equal.
Now we are engaged in a great civil war, testing whether that nation,
or any nation so conceived and so dedicated, can long endure. "},
{"id": "doc-7", "content": "401(k)"},
{"id": "doc-8", "content": ["And, this", "is the end", "of this test."]}]'
Results:
"id":"doc-1",
"content":["Hello World"],
"content_wc_i":2,
"id":"doc-2",
"content":[""],
"content_wc_i":0,
"id":"doc-3",
"content":[" -- --- !"],
"content_wc_i":0,
"id":"doc-4",
"content":["This is some more."],
"content_wc_i":4,
"id":"doc-5",
"content":["The CD-ROM, (and num_events_seen.)"],
"content_wc_i":4,
"id":"doc-6",
"content":["Four score and seven years ago our fathers\n
brought forth on this continent a new nation, conceived in liberty,\n
and dedicated to the proposition that all men are created equal.\n
Now we are engaged in a great civil war, testing whether that nation,\n
or any nation so conceived and so dedicated, can long endure. "],
"content_wc_i":54,
"id":"doc-7",
"content":["401(k)"],
"content_wc_i":2,
"id":"doc-8",
"content":["And, this",
"is the end",
"of this test."],
"content_wc_i":8,
-- Jack Krupansky
-----Original Message-----
From: Jack Krupansky
Sent: Thursday, June 06, 2013 5:07 PM
To: solr-user@lucene.apache.org
Subject: Re: Filtering on results with more than N words.
From the book, here's an update request processor chain which will count the
words in the "content" field and place the count in the "content_len_i" field.
Then you could do a range query on that count.
<!-- Word count computed purely with stock processors; the count ends up in "content_len_i". -->
<updateRequestProcessorChain name="regex-count-words">
  <!-- Step 1: seed the work field with a copy of "content". -->
  <processor class="solr.CloneFieldUpdateProcessorFactory">
    <str name="source">content</str>
    <str name="dest">content_len_i</str>
  </processor>

  <!-- Step 2: join multiple values into one string, separated by a single space. -->
  <processor class="solr.ConcatFieldUpdateProcessorFactory">
    <str name="fieldName">content_len_i</str>
    <str name="delimiter"> </str>
  </processor>

  <!-- Step 3: drop hyphens and underscores so their parts form one word. -->
  <processor class="solr.RegexReplaceProcessorFactory">
    <str name="fieldName">content_len_i</str>
    <str name="pattern">-|_</str>
    <str name="replacement"></str>
  </processor>

  <!-- Step 4: collapse every word to the single letter "X". -->
  <processor class="solr.RegexReplaceProcessorFactory">
    <str name="fieldName">content_len_i</str>
    <str name="pattern">\w+</str>
    <str name="replacement">X</str>
  </processor>

  <!-- Step 5: strip everything that is not an "X" (punctuation, white space). -->
  <processor class="solr.RegexReplaceProcessorFactory">
    <str name="fieldName">content_len_i</str>
    <str name="pattern">[^X]</str>
    <str name="replacement"></str>
  </processor>

  <!-- Step 6: the length of the remaining "X" string stands in for the word count. -->
  <processor class="solr.FieldLengthUpdateProcessorFactory">
    <str name="fieldName">content_len_i</str>
  </processor>

  <!-- Tail of the chain: log the update, then run it. -->
  <processor class="solr.LogUpdateProcessorFactory" />
  <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
Here's a test update using the Solr example schema, assuming you add the
above URP chain to solrconfig:
# Post eight sample documents as JSON through the "regex-count-words" chain
# and commit. The URL is quoted so the shell does not interpret "&".
curl "http://localhost:8983/solr/update?commit=true&update.chain=regex-count-words" \
  -H 'Content-type:application/json' -d '
[{"id": "doc-1", "content": "Hello World"},
{"id": "doc-2", "content": ""},
{"id": "doc-3", "content": " -- --- !"},
{"id": "doc-4", "content": "This is some more."},
{"id": "doc-5", "content": "The CD-ROM, (and num_events_seen.)"},
{"id": "doc-6", "content": "Four score and seven years ago our fathers
brought forth on this continent a new nation, conceived in liberty,
and dedicated to the proposition that all men are created equal.
Now we are engaged in a great civil war, testing whether that nation,
or any nation so conceived and so dedicated, can long endure. "},
{"id": "doc-7", "content": "401(k)"},
{"id": "doc-8", "content": ["And, this", "is the end", "of this test."]}]'
Results:
"id":"doc-1",
"content":["Hello World"],
"content_len_i":2,
"id":"doc-2",
"content":[""],
"content_len_i":0,
"id":"doc-3",
"content":[" -- --- !"],
"content_len_i":0,
"id":"doc-4",
"content":["This is some more."],
"content_len_i":4,
"id":"doc-5",
"content":["The CD-ROM, (and num_events_seen.)"],
"content_len_i":4,
"id":"doc-6",
"content":["Four score and seven years ago our fathers\n
brought forth on this continent a new nation, conceived in liberty,\n
and dedicated to the proposition that all men are created equal.\n
Now we are engaged in a great civil war, testing whether that nation,\n
or any nation so conceived and so dedicated, can long endure. "],
"content_len_i":54,
"id":"doc-7",
"content":["401(k)"],
"content_len_i":2,
"id":"doc-8",
"content":["And, this",
"is the end",
"of this test."],
"content_len_i":8,
-- Jack Krupansky
-----Original Message-----
From: Dotan Cohen
Sent: Thursday, June 06, 2013 3:45 AM
To: solr-user@lucene.apache.org
Subject: Filtering on results with more than N words.
Is there any way to restrict the search results to only those
documents with more than N words / tokens in the searched field? I
thought that this would be an easy one to Google for, but I cannot
figure it out or find any references. There are many references to
word size in characters, but not to field size in words.
Thank you.
--
Dotan Cohen
http://gibberish.co.il
http://what-is-what.com