Hey ho,

First of all: I am not sure whether this topic belongs on the Solr or the Nutch list, so sorry for the double post.

For some reason, all of the Solr documents have a boost value of 1.0.

I indexed them using the solrindex command from Nutch 1.3. The pages were scored with WebGraph, and the statistics output for the crawldb that was used is:

11/10/12 01:55:05 INFO crawl.CrawlDbReader: Statistics for CrawlDb: crawldb
11/10/12 01:55:05 INFO crawl.CrawlDbReader: TOTAL urls: 243751
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 0:    242738
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 1:    627
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 2:    127
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 3:    148
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 4:    111
11/10/12 01:55:05 INFO crawl.CrawlDbReader: min score:  0.0
11/10/12 01:55:05 INFO crawl.CrawlDbReader: avg score:  0.4357474
11/10/12 01:55:05 INFO crawl.CrawlDbReader: max score:  2764.215
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 1 (db_unfetched): 32425
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 2 (db_fetched): 182141
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 3 (db_gone): 17783
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 4 (db_redir_temp):   8506
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 5 (db_redir_perm):   2302
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 6 (db_notmodified):  594
11/10/12 01:55:05 INFO crawl.CrawlDbReader: CrawlDb statistics: done

As you can see, the URLs have a score. Shouldn't these values appear in the boost field of the Solr documents after indexing?
The version of the Solr server is 3.4.

Does anybody have any suggestions?

Thanks in advance

schema.xml:

<schema name="nutch" version="1.3">
  <!-- Field types referenced by the "type" attribute of the fields below. -->
  <types>
    <!-- Non-analyzed types; norms are omitted on all of them. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
               omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
               omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
               omitNorms="true" positionIncrementGap="0"/>

    <!-- Analyzed text: whitespace tokenizer, then stop-word, word-delimiter,
         lower-case, English Porter and duplicate-removal filters, in that order. -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- URL text: standard tokenizer, lower-cased, then split by the word-delimiter filter. -->
    <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"/>
      </analyzer>
    </fieldType>
  </types>

  <fields>
    <!-- Document key (declared as uniqueKey at the end of this schema). -->
    <field name="id" type="string" stored="true" indexed="true"/>

    <!-- Core fields: stored only, not indexed. -->
    <field name="segment" type="string" stored="true" indexed="false"/>
    <field name="digest" type="string" stored="true" indexed="false"/>
    <field name="boost" type="float" stored="true" indexed="false"/>

    <!-- Fields for the index-basic plugin. -->
    <field name="host" type="url" stored="false" indexed="true"/>
    <field name="site" type="string" stored="false" indexed="true"/>
    <field name="url" type="url" stored="true" indexed="true" required="true"/>
    <field name="content" type="text" stored="true" indexed="true"/>
    <field name="title" type="text" stored="true" indexed="true" multiValued="true"/>
    <field name="cache" type="string" stored="true" indexed="false"/>
    <field name="tstamp" type="date" stored="true" indexed="false"/>

    <!-- Fields for the index-anchor plugin. -->
    <field name="anchor" type="string" stored="true" indexed="true" multiValued="true"/>

    <!-- Fields for the index-more plugin. -->
    <field name="type" type="string" stored="true" indexed="true" multiValued="true"/>
    <field name="contentLength" type="long" stored="true" indexed="false"/>
    <field name="lastModified" type="date" stored="true" indexed="false"/>
    <field name="date" type="date" stored="true" indexed="true"/>

    <!-- Fields for the languageidentifier plugin. -->
    <field name="lang" type="string" stored="true" indexed="true"/>

    <!-- Fields for the subcollection plugin. -->
    <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/>

    <!-- Fields for the feed plugin ("tag" is also used by microformats-reltag). -->
    <field name="author" type="string" stored="true" indexed="true"/>
    <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
    <field name="feed" type="string" stored="true" indexed="true"/>
    <field name="publishedDate" type="date" stored="true" indexed="true"/>
    <field name="updatedDate" type="date" stored="true" indexed="true"/>

    <!-- Fields for the creativecommons plugin. -->
    <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
  </fields>

  <uniqueKey>id</uniqueKey>
  <defaultSearchField>content</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>
</schema>

solrindex-mapping.xml:

<mapping>
  <!--
    Maps fields produced by the Nutch IndexingFilters onto the fields
    that the Solr schema.xml defines (and expects).

    * field: a NutchDocument field whose name equals field/@source is
      renamed to field/@dest.
    * copyField: if a field name (before mapping) equals copyField/@source,
      its values are additionally copied into copyField/@dest.
    * uniqueKey: same meaning as in the Solr schema.xml; defaults to "id"
      when it is not defined.
  -->
  <fields>
    <field dest="content" source="content"/>
    <field dest="site" source="site"/>
    <field dest="title" source="title"/>
    <field dest="host" source="host"/>
    <field dest="segment" source="segment"/>
    <field dest="boost" source="boost"/>
    <field dest="digest" source="digest"/>
    <field dest="tstamp" source="tstamp"/>
    <field dest="id" source="url"/>
    <copyField source="url" dest="url"/>
  </fields>
  <uniqueKey>id</uniqueKey>
</mapping>

Reply via email to