> > 2) Change 'sort' config (http://www.htdig.org/attrs.html#sort) to
> > 'revtime'. This will bring up the latest messages first. Very good for
> > huge archives where old answers are very likely outdated.

Ok, done. It should take effect on the next indexing run (which
happens every three days). Tobias, your job is to confirm that
performance is not noticeably affected for a big list like jboss. The
documentation claims there will be a hit, but I suspect it will be
negligible.
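
(For reference, the per-list config that the script below generates now
carries the attribute verbatim:

    sort:                 revtime

so nothing needs changing by hand once the next run picks it up.)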

> >1) Increase the 'maximum_pages' config for htsearch to 30 (or at least
> >20, see http://www.htdig.org/attrs.html#maximum_pages)

I'm now convinced -- send me a patch to make it work. Note that
I only have images for result pages 1-10; I'm happy to replace them
if someone with more artistic talent volunteers. Don't let the result
page images wrap or extend off into scrollbarland. (Google and
AltaVista solve this problem well; I don't know what htdig has up its
sleeve.)
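
(If someone does take a crack at that patch, my read of the htdig docs is
that it comes down to one extra attribute in the generated per-list config,
something like:

    maximum_pages:      20

plus growing the page_number_text / no_page_number_text lists in the script
below from 10 entries to 20 -- button11.gif through button20.gif are
hypothetical and would have to be drawn first.)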

Jeff

PS. Web subscription to [EMAIL PROTECTED] seems to have gotten messed up
recently; I'm not sure what the deal is.


==============================

#!/bin/bash
#########################################################
#                                                       #
# My goal in life is to index web pages.                #
# I work in conjunction with the htDig search engine.   #
# I will create a separate searchable index             #
# for each mailing list. I'm happiest running as        #
# a batch job every once in a while.                    #
#                                                       #
# Time-stamp: <Thu 04/27/2000 14:16:29>                 #
#                                                       #
#########################################################

##############################################
# Set global variables.                      #
##############################################

PATH=/usr/sbin:$PATH

MAIL=xxINSTxx/Mail
PAGES=xxARCHIVExx
CONF_DIR=xxHTCONFxx
VAULT=xxINSTxx/vault
MAINTAINER=xxMAINTAINERxx
TARGET=xxURLxx
CONF=xxCONFxx
#COMPRESSION=on
COMPRESSION=

CRAP1="border=0 align=middle width=30 height=30"
CRAP2="border=2 align=middle width=30 height=30"

####################################
###  function reporting status   ###
####################################

# Usage: echok <err|notice|info> message
# Sample usage: echok info "Shields are down to 14 percent."

echok () {
        logger -t digger -p local0.$1 "$2"
}

##############################################
# Function for indexing a particular list    #
##############################################

churn () {
    MAILLIST=$1

    if [ ! -f $PAGES/$MAILLIST/maillist.html ]
    then
        echo "Can't find $PAGES/$MAILLIST/maillist.html"
        return
    fi

    # Incremental indexing
    #    Skip stuff that hasn't changed in the last n days.
    #    Bash dependent, since we do math.
    #    Uses the GNU 'date -r' extension to read the file's mtime.
    if [ "$RECENT" = "true" ]
    then
        # Modification time of the archive database, in seconds since the epoch.
        THEN=`date +%s -r $PAGES/$MAILLIST/.mhonarc.db`
        NOW=`date +%s`
        if  [ $(( ($NOW-$THEN) >= ($DAYS * 86400) )) = 1 ]
        then
            echok info "Skipping: $MAILLIST"
            return
        fi
    fi

    echok info "Indexing $MAILLIST"
    ESCAPED_NAME=`echo $MAILLIST | tr '@.' '__'`
    CFG=$CONF_DIR/$ESCAPED_NAME.conf
    DATABASE_DIR=$VAULT/$ESCAPED_NAME
    mkdir -p $DATABASE_DIR
    mkdir -p $CONF_DIR

    if [ -f $DATABASE_DIR/rcfile ]
    then
        LANG=`head $DATABASE_DIR/rcfile | grep Localization | cut -f3 -d' '`
    else
        LANG=
    fi

    ################################################
    # Create a custom configuration file.          #
    ################################################

    echo "# HTDIG configuration file."                        > $CFG
    echo "# Automaticly generated. Do not edit."             >> $CFG
    echo                                                     >> $CFG
    echo "start_url:        $TARGET/$MAILLIST/maillist.html" >> $CFG

    echo "database_dir:           $DATABASE_DIR"             >> $CFG
    echo "bad_word_list:          $CONF/bad_words.txt"       >> $CFG
    echo "nothing_found_file:     $CONF/nomatch$LANG.html"   >> $CFG
    echo "search_results_wrapper: $CONF/wrapper$LANG.html"   >> $CFG

    echo "limit_urls_to:        $TARGET/$MAILLIST/"          >> $CFG
    echo "exclude_urls:         .dir .htaccess .mhonarc.db"  >> $CFG
    echo "max_head_length:      10000"                       >> $CFG
    echo "remove_bad_urls:      true"                        >> $CFG
    echo "use_star_image:       no"                          >> $CFG
    echo "maintainer:           $MAINTAINER"                 >> $CFG
    echo "search_algorithm:     exact:1"                     >> $CFG
    echo "allow_virtual_hosts:  true"                        >> $CFG
    echo "allow_numbers:        true"                        >> $CFG
    echo "no_next_page_text:"                                >> $CFG
    echo "no_prev_page_text:"                                >> $CFG
#   echo "uncoded_db_compatible: false"                      >> $CFG
    echo "backlink_factor:      0"                           >> $CFG
    echo "sort:                 revtime"                     >> $CFG
    echo                                                     >> $CFG

    echo "local_urls: \\"                                    >> $CFG
    echo " $TARGET/$MAILLIST/=$PAGES/$MAILLIST/"             >> $CFG
    echo                                                     >> $CFG

    echo "next_page_text: \\"                                >> $CFG
    echo " <img src=/htdig/buttonr.gif $CRAP1 alt=next>"     >> $CFG
    echo                                                     >> $CFG

    echo "common_url_parts: \\"                              >> $CFG
    echo "$TARGET/$MAILLIST/msg .html"   >> $CFG
    echo                                                     >> $CFG

    echo "prev_page_text: \\"                                >> $CFG
    echo " <img src=/htdig/buttonl.gif $CRAP1 alt=prev>"     >> $CFG
    echo                                                     >> $CFG

    echo "page_number_text: \\"                              >> $CFG
    echo " \"<img src=/htdig/button1.gif $CRAP1 alt=1>\" \\" >> $CFG
    echo " \"<img src=/htdig/button2.gif $CRAP1 alt=2>\" \\" >> $CFG
    echo " \"<img src=/htdig/button3.gif $CRAP1 alt=3>\" \\" >> $CFG
    echo " \"<img src=/htdig/button4.gif $CRAP1 alt=4>\" \\" >> $CFG
    echo " \"<img src=/htdig/button5.gif $CRAP1 alt=5>\" \\" >> $CFG
    echo " \"<img src=/htdig/button6.gif $CRAP1 alt=6>\" \\" >> $CFG
    echo " \"<img src=/htdig/button7.gif $CRAP1 alt=7>\" \\" >> $CFG
    echo " \"<img src=/htdig/button8.gif $CRAP1 alt=8>\" \\" >> $CFG
    echo " \"<img src=/htdig/button9.gif $CRAP1 alt=9>\" \\" >> $CFG
    echo " \"<img src=/htdig/button10.gif $CRAP1 alt=10>\""  >> $CFG
    echo                                                     >> $CFG

    echo "no_page_number_text: \\"                           >> $CFG
    echo " \"<img src=/htdig/button1.gif $CRAP2 alt=1>\" \\" >> $CFG
    echo " \"<img src=/htdig/button2.gif $CRAP2 alt=2>\" \\" >> $CFG
    echo " \"<img src=/htdig/button3.gif $CRAP2 alt=3>\" \\" >> $CFG
    echo " \"<img src=/htdig/button4.gif $CRAP2 alt=4>\" \\" >> $CFG
    echo " \"<img src=/htdig/button5.gif $CRAP2 alt=5>\" \\" >> $CFG
    echo " \"<img src=/htdig/button6.gif $CRAP2 alt=6>\" \\" >> $CFG
    echo " \"<img src=/htdig/button7.gif $CRAP2 alt=7>\" \\" >> $CFG
    echo " \"<img src=/htdig/button8.gif $CRAP2 alt=8>\" \\" >> $CFG
    echo " \"<img src=/htdig/button9.gif $CRAP2 alt=9>\" \\" >> $CFG
    echo " \"<img src=/htdig/button10.gif $CRAP2 alt=10>\""  >> $CFG

    ################################################
    # Prepare for incremental indexing             #
    ################################################

    if [ -f $DATABASE_DIR/db.docdb ]
    then
        nice cp -f $DATABASE_DIR/db.docdb $DATABASE_DIR/db.docdb.work 
    fi
    if  [ -f $DATABASE_DIR/db.wordlist.work.bz2 ]
    then
        bunzip2 $DATABASE_DIR/db.wordlist.work.bz2
        [ $? != 0 ] && rm -f $DATABASE_DIR/db.wordlist.work.bz2
    fi

    ################################################
    # Run the indexing programs                    #
    ################################################

    echok info "Digging $MAILLIST"
    nice htdig -a -c $CFG
    ex=$?
    if [ "$ex" != "0" ]
    then
        echok err "htdig exited with $ex for $MAILLIST"
        return
    fi

    echok info "Merging $MAILLIST"
    nice htmerge -a -c $CFG
    ex=$?
    if [ "$ex" != "0" ]
    then
        echok err "htmerge exited with $ex for $MAILLIST"
        return
    fi

    ################################################
    # Make new results canonical                   #
    ################################################

    [ "$COMPRESSION" ] && bzip2 -f $DATABASE_DIR/db.wordlist.work
    nice mv  $DATABASE_DIR/db.docdb.work      $DATABASE_DIR/db.docdb
    nice mv  $DATABASE_DIR/db.docs.index.work $DATABASE_DIR/db.docs.index
    nice mv  $DATABASE_DIR/db.words.db.work   $DATABASE_DIR/db.words.db

    echok info "All done."
}

sweep () {
    cd $MAIL || return
    for i in [a-z]*@* ; do
        churn $i
    done
}

usage () {
    echo "Usage: $0 { <listname> | -all | -recent <days> | -help }"
    echo
    echo "Create search engine index"
    echo "   <listname>         Index a particular mailing list"
    echo "   --all              Index all mailing lists"
    echo "   --recent <days>    Index all with timestamps within <days> days."
    echo "   --help             Display this message"
    echo
}

# Parse command line arguments
if [ "$1" = "" -o "$1" = "-h" -o "$1" = "--help" -o "$1" = "-help" ]
then
    usage
elif [ "$1" = "--all" ]
then
    sweep
elif [ "$1" = "--recent" ]
then
    DAYS=$2
    if [ "$2" = "" ]
    then
        usage
        exit 1
    fi
    RECENT=true
    sweep
else
    churn $1
fi

exit 0
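
(Usage sketch, not gospel: assuming the script lands somewhere like
xxINSTxx/digger -- the name and path here are placeholders, nothing in the
script fixes them -- the every-three-days batch run mentioned above is a
single cron entry:

    # re-index all mailing lists every three days at 03:00
    0 3 */3 * * xxINSTxx/digger --all

and a single list can still be refreshed by hand with
'xxINSTxx/digger <listname>'.)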


