Re: [htdig3-dev] Conclusions on prefix compression with db

loic Fri, 23 Jul 1999 09:26:37 -0700

Geoff Hutchison writes:
 > 
 > If you can get me the source, I'll contact Sleepycat and ask about the
 > results. From previous experience, they're willing to answer questions
 > like this.

 Here are the sources. Please disregard the ones I sent previously.
Thanks for your help on this :-)

-- benchit.cc

#ifdef HAVE_CONFIG_H
#include <htconfig.h>
#endif /* HAVE_CONFIG_H */

#include <fstream.h>

// If we have this, we probably want it.
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#include <htString.h>
#include <Database.h>

// Benchmark settings; main() fills these in from the command line.
typedef struct {
  char* wordsfile;       // -w: file the word list is read from
  char* data;            // -D: string appended to a record by try2/try3
  int nwords;            // -n: stop reading after this many words (<= 0: no limit)
  int add;               // -a: number of update rounds per key
  int loop;              // -l: number of times the selected test is run
  int ndoc;              // -d: number of keys generated for each word
  int prefix;            // -p: prepend "0000000000" to every key
  int disable_prefix;    // -P: set mode bit 0x800000 when opening the database
  int suffix;            // -s: append "0000000000" to every key
} params_t;

static void keys();

static void dobench(int mode, params_t* params);
static int verbose;

void usage();

//*****************************************************************************
// int main(int ac, char **av)
//
int main(int ac, char **av)
{
  extern char           *optarg;
  params_t              params;
  int                   mode = 1;
  int                   opt;

  // Defaults; each one can be overridden by an option below.
  params.wordsfile = strdup("words.all");
  params.data = strdup("a");
  params.nwords = -1;
  params.add = 10000;
  params.loop = 1;
  params.ndoc = 1;
  params.prefix = 0;
  params.suffix = 0;
  params.disable_prefix = 0;

  for (;;)
    {
      opt = getopt(ac, av, "v01234w:n:a:l:d:D:kpsP");
      if (opt == -1)
        break;

      // A digit option selects the benchmark mode.
      if (opt >= '0' && opt <= '4')
        {
          mode = opt - '0';
          continue;
        }

      switch (opt)
        {
        // Flags without argument.
        case 'v': verbose++;                 break;
        case 'p': params.prefix = 1;         break;
        case 's': params.suffix = 1;         break;
        case 'P': params.disable_prefix = 1; break;

        // Numeric arguments.
        case 'n': params.nwords = atoi(optarg); break;
        case 'a': params.add = atoi(optarg);    break;
        case 'l': params.loop = atoi(optarg);   break;
        case 'd': params.ndoc = atoi(optarg);   break;

        // String arguments replace the strdup'ed defaults.
        case 'w':
          free(params.wordsfile);
          params.wordsfile = strdup(optarg);
          break;
        case 'D':
          free(params.data);
          params.data = strdup(optarg);
          break;

        // Dump the keys and stop; no benchmark is run.
        case 'k':
          keys();
          exit(0);
          break;

        // Unknown option reported by getopt.
        case '?':
          usage();
          break;
        }
    }

  dobench(mode, &params);

  free(params.wordsfile);
  free(params.data);

  return 0;
}


//*****************************************************************************
// void usage()
//   Display program usage information
//
void usage()
{
    // The synopsis lists the options documented below; exits with status 0
    // once the help text has been printed.
    cout << "usage: benchit [-v][-0|-1|-2|-3][-k][-w file][-n limit][-a loop]\n";
    cout << "               [-l loop][-d n][-D data][-p][-P][-s]\n";
    cout << "Options:\n";
    cout << "\t-v\tVerbose mode.  This increases the verbosity of the program.\n\n";
    cout << "\t-0\tOnly fill with word/documentid.\n\n";
    cout << "\t-1\tNo dups, get, grow, put.\n\n";
    cout << "\t-2\tDups, get, delete, grow, put.\n\n";
    cout << "\t-3\tDups, put.\n\n";
    cout << "\t-k\tPrint every key of /opt/www/var/htdig/db.words.db and exit.\n\n";
    cout << "\t-w file\tRead words list from file (default words.all).\n\n";
    cout << "\t-n limit\tRead at most <limit> words (default no limit).\n\n";
    cout << "\t-a loop\tLoop <loop> times when updating a word (default 10 000).\n\n";
    cout << "\t-l loop\tDo the whole test <loop> times (default 1).\n\n";
    cout << "\t-d n\tDo as if each word appear in <n> documents (default 1).\n\n";
    cout << "\t-D data\tdata appended to record is <data> (default a).\n\n";
    cout << "\t-p\tadd prefix 0000000000 to key (default no prefix).\n\n";
    cout << "\t-P\tdisable prefix compression (default enabled).\n\n";
    cout << "\t-s\tadd suffix 0000000000 to key (default no suffix).\n\n";
    exit(0);
}

//
// Load the word list named by params->wordsfile into db.
// For every line read, params->ndoc records are stored: the key is
// [pad]word\001<n>[pad] (pads only with -p / -s) and the value is the word.
// Stops at end of file, on a read failure, or after params->nwords words
// when that limit is positive.
//
static void fill(Database* db, params_t* params)
{
  ifstream in(params->wordsfile);
  char  buffer[50000];
  // Big enough for the longest key that can be built: a full buffer,
  // both 10 byte pads, the \001 separator and the decimal document number.
  char tmp[sizeof(buffer) + 64];
  const char* pad = "0000000000";
  int words_count = 0;

  cout << "Reading from " << params->wordsfile << " ... ";
  while (!in.fail()) {
    in.getline(buffer, sizeof(buffer));
    // fail() covers a file that could not be opened and a line longer than
    // buffer; without this test the loop would never terminate in those cases.
    if (in.eof() || in.fail())
      break;

    String line;
    line << buffer;
    line.chop("\r\n");

    char *current = line.get();

    for(int j = 0; j < params->ndoc; j++) {
      snprintf(tmp, sizeof(tmp), "%s%s\001%d%s",
               (params->prefix ? pad : ""), current, j,
               (params->suffix ? pad : ""));
      db->Put(tmp, line);
    }

    words_count++;
    if(params->nwords > 0 && params->nwords <= words_count) break;
  }

  cout << "pushed " << words_count << " words (" << params->ndoc << " documents) \n";
}

//
// Mode 1: get, grow, put.
// Looks up the first key the sequential cursor returns for each probe
// word, then params->add times reads its record, appends "a" and writes it
// back. Finally each record is overwritten with the key itself.
// Any database failure aborts the program.
//
static void try1(Database* db, params_t* params)
{
  char* keys[] = {
    "billions",
    "millions",
    0
  };

  // Replace each probe word by a private copy of the key actually found.
  for(char** key = keys; *key; key++) {
    db->Start_Seq(*key);
    const char* found = db->Get_Next_Seq();
    if(!found)
      abort();            // nothing at or after the probe word
    *key = strdup(found);
    if(!*key)
      abort();            // a null entry would silently end the key list
    cout << "key: " << *key << "\n";
  }

  for(int i = 0; i < params->add; i++) {
    for(char** key = keys; *key; key++) {
      String data;
  
      if(db->Get(*key, data) != OK)
        abort();

      data += "a";

      if(db->Put(*key, data) != OK)
        abort();

      // Report the record size on the last round only.
      if(i >= params->add - 1)
        cout << "try1: " << *key << " is " << data.length() << " bytes long\n";
    }
  }

  // Overwrite each record with its key, then release the copy made above
  // so repeated calls (-l) do not leak.
  for(char** key = keys; *key; key++) {
    if(db->Put(*key, *key) != OK)
      abort();
    free(*key);
  }
}

//
// Mode 2: get, delete, grow, put.
// Same key lookup as try1; each of the params->add rounds reads the record,
// deletes the key, appends params->data and String(i), and stores the
// result again. Finally each key is stored with the key itself as data.
// Any database failure aborts the program.
//
static void try2(Database* db, params_t* params)
{
  char* keys[] = {
    "billions",
    "millions",
    0
  };

  // Replace each probe word by a private copy of the key actually found.
  for(char** key = keys; *key; key++) {
    db->Start_Seq(*key);
    const char* found = db->Get_Next_Seq();
    if(!found)
      abort();            // nothing at or after the probe word
    *key = strdup(found);
    if(!*key)
      abort();            // a null entry would silently end the key list
    cout << "key: " << *key << "\n";
  }

  for(int i = 0; i < params->add; i++) {
    for(char** key = keys; *key; key++) {
      String data;
  
      if(db->Get(*key, data) != OK)
        abort();
      if(db->Delete(*key) != OK)
        abort();

      data += params->data;
      data += String(i);

      if(db->Put(*key, data) != OK)
        abort();

      // Report the record size on the last round only.
      if(i >= params->add - 1)
        cout << "try2: " << *key << " is " << data.length() << " bytes long\n";
    }
  }

  // Store each key with itself as data, then release the copy made above
  // so repeated calls (-l) do not leak.
  for(char** key = keys; *key; key++) {
    if(db->Put(*key, *key) != OK)
      abort();
    free(*key);
  }
}

//
// Mode 3: put only.
// For each of the two fixed keys, params->add times builds a record from
// params->data followed by String(i) and stores it. Aborts if a Put fails.
//
static void try3(Database* db, params_t* params)
{
  char* keys[] = {
    "billions",
    "millions",
    0
  };

  for(int i = 0; i < params->add; i++) {
    for(char** key = keys; *key; key++) {
      String data(params->data);
      data += String(i);
  
      if(db->Put(*key, data) != OK)
        abort();

      // Report the record size on the last round only; the label names this
      // test so bench3 output is not mistaken for try2.
      if(i >= params->add - 1)
        cout << "try3: " << *key << " is " << data.length() << " bytes long\n";
    }
  }
}

static void dobench(int mode, params_t* params)
{
  Database* db = Database::getDatabaseInstance(DB_BTREE);
  int flags = 0644;

  if(mode == 1)
    flags |= 0x400000;

  if(params->disable_prefix)
    flags |= 0x800000;

  if(db->OpenReadWrite("./test", flags) != OK)
    abort();

  fill(db, params);

  for(int i = 0; i < params->loop; i++) {
    switch(mode) {
    case 1:
      try1(db, params);
      break;
    case 2:
      try2(db, params);
      break;
    case 3:
      try3(db, params);
      break;
    }
  }

  if(db->Close() != OK)
    abort();

  delete db;
}

static void keys()
{
  Database* db = Database::getDatabaseInstance(DB_BTREE);

  if(db->OpenRead("/opt/www/var/htdig/db.words.db") != OK)
    abort();

  db->Start_Get();
  char* key;
  while((key = db->Get_Next())) {
    cout << key << "\n";
  }

  if(db->Close() != OK)
    abort();

  delete db;
}

-- Makefile.am

include $(top_srcdir)/Makefile.config

# NOTE(review): -p presumably turns on profiling for the gprof step of the
# dobench target -- confirm against Makefile.config.
LOCAL_DEFINES = -p

# benchit is built but never installed.
noinst_PROGRAMS = benchit

benchit_SOURCES = benchit.cc
benchit_DEPENDENCIES = $(HTLIBS)
benchit_LDFLAGS = -p
benchit_LDADD = $(HTLIBS)

#
# Run bench1 once to make sure everything that
# will be cached is cached, then run modes 1, 2 and 3
# through the dobench target (output in bench<mode>.out).
#
bench: benchit
        ./benchit -1 > /dev/null 2>&1
        for i in 1 2 3 ; do \
          $(MAKE) BENCH=$$i dobench ; \
        done

# Defaults for dobench: benchmark mode and -l repeat count.
# The bench target overrides BENCH on the $(MAKE) command line.
BENCH=1
LOOP=20

# Run one benchmark mode on a freshly removed test file and collect the
# timing, the gprof report, a db_dump/db_load round trip, the db_stat
# output and the file listing in bench$(BENCH).out.
dobench: benchit
          ( rm -f test ; \
            time -v ./benchit -$(BENCH) -l $(LOOP) ; \
            gprof .libs/benchit ; \
            db_dump test > test.dump ; \
            db_load test < test.dump ; \
            db_stat -d test ; \
            ls -l test ) > bench$(BENCH).out 2>&1 

# Value passed to benchit -d by the prefix target.
NDOC = 10
# Run the prefix target twice: first with -P (DISABLE_PREFIX set),
# then with DISABLE_PREFIX left empty.
doprefix:
        #
        # Without prefix function $(NDOC) documents
        #
        $(MAKE) DISABLE_PREFIX=-P prefix
        #
        # With prefix function $(NDOC) documents
        #
        $(MAKE) prefix

# Fill the test file three times in mode 0 (plain keys, -p keys, -s keys)
# and for each run print the size reported by stat divided by
# (line count of words.all * $(NDOC)) as "bytes per word".
prefix: benchit
        @rm -f test ; \
        nwords=`wc -l words.all | perl -n -e 'print if(s/\s*(\d+).*/\1/)'` ; \
        time ./benchit -0 -d $(NDOC) $(DISABLE_PREFIX) ; \
        s1=`stat test | perl -n -e 'print if(s/.*size:\s+(\d+).*/\1/i)'` ; \
        bw=`expr $$s1 / \( $$nwords \* $(NDOC) \)` ; \
        echo "no prefix : $$bw bytes per word" ; \
        rm -f test ; \
        time ./benchit -0 -d $(NDOC) -p $(DISABLE_PREFIX) ; \
        s1=`stat test | perl -n -e 'print if(s/.*size:\s+(\d+).*/\1/i)'` ; \
        bw=`expr $$s1 / \( $$nwords \* $(NDOC) \)` ; \
        echo "prefix : $$bw bytes per word" ; \
        rm -f test ; \
        time ./benchit -0 -d $(NDOC) -s $(DISABLE_PREFIX) ; \
        s1=`stat test | perl -n -e 'print if(s/.*size:\s+(\d+).*/\1/i)'` ; \
        bw=`expr $$s1 / \( $$nwords \* $(NDOC) \)` ; \
        echo "suffix : $$bw bytes per word"

# Build a sorted, duplicate-free list of lower-cased words (3 to 31
# letters) from the compressed info pages under /usr/info, written to the
# file "words". Lines are continued only between pipeline stages so the
# quoted perl script stays on a single line.
words:
        find /usr/info -name '*.gz' -print | xargs zcat | \
          perl -n -e 'print join("\n", map { lc } grep(length() > 2 && length() < 32, m/[a-z]+/ig)) . "\n"' | \
          grep -v '^$$' | \
          sort -u > words

# .PHONY (with the leading dot) is the special target make recognises;
# it makes the list rebuild even when a file named "words" already exists.
.PHONY: words

-- And the small hack to DB2_db.cc that makes it work

/*
 * bam_defpfx --
 *      Default prefix routine (file-local; the comment this was taken
 *      from names it __bam_defpfx).
 *
 *      Returns the 1-based position of the first byte at which a and b
 *      differ. When one key is a prefix of the other, returns
 *      a->size + 1 if a is the shorter key, a->size otherwise.
 */
static size_t
bam_defpfx(const DBT *a, const DBT *b)
{
        const u_int8_t *lhs = (const u_int8_t *)a->data;
        const u_int8_t *rhs = (const u_int8_t *)b->data;
        const size_t shared = a->size > b->size ? b->size : a->size;
        size_t i;

        /* Scan the part both keys have in common. */
        for (i = 0; i < shared; i++) {
                if (lhs[i] != rhs[i])
                        return (i + 1);
        }

        /*
         * No difference in the common part. a->size must be <= b->size
         * here, or the keys would not have been passed in this order.
         */
        if (a->size < b->size)
                return (a->size + 1);
        return (a->size);
}


/*
 * bam_defcmp --
 *      Default comparison routine (file-local; the comment this was taken
 *      from names it __bam_defcmp).
 *
 *      Compares the keys byte by byte over their common length and
 *      falls back to the size difference when that part is equal.
 *
 *      Returns:
 *           < 0 if a is < b
 *           = 0 if a is = b
 *           > 0 if a is > b
 */
static int
bam_defcmp(const DBT *a, const DBT *b)
{
        const u_int8_t *lhs = (const u_int8_t *)a->data;
        const u_int8_t *rhs = (const u_int8_t *)b->data;
        size_t remaining = a->size > b->size ? b->size : a->size;

        /*
         * XXX
         * The differences below are computed as long and returned as int.
         * If a size_t does not fit into a long, or a difference does not
         * fit into an int, the result can be wrong; there is no signed
         * integral type guaranteed to be at least as large as a size_t.
         */
        while (remaining-- > 0) {
                if (*lhs != *rhs)
                        return ((long)*lhs - (long)*rhs);
                ++lhs;
                ++rhs;
        }

        return ((long)a->size - (long)b->size);
}

int
DB2_db::OpenReadWrite(char *filename, int mode)
{
    //
    // Initialize the database environment.
    //
    dbenv = db_init((char *)NULL);
    memset(&dbinfo, 0, sizeof(dbinfo));
//    dbinfo.db_cachesize = CACHE_SIZE_IN_KB * 1024;    // Cachesize: 64K.
//    dbinfo.db_pagesize = 1024;                        // Page size: 1K.
    // Hack for benchmarking: bit 0x800000 leaves the prefix function unset, bit 0x400000 turns off duplicates.
    if(!(mode & 0x800000)) dbinfo.bt_prefix = bam_defpfx;
    dbinfo.bt_compare = bam_defcmp;
    if(!(mode & 0x400000)) dbinfo.flags = DB_DUP;
..


------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the SUBJECT of the message.
Re: [htdig3-dev] Conclusions on prefix compression with db

Reply via email to