I made a different version of the LuceneIndexTransformer that is closer to the Lucene concepts
(more basic and flexible), based on the old LuceneIndexTransformer.


Example of input source:

<page xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
<lucene:index create="true"
analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"
directory="d:/indexbase"
merge-factor="merge-factor">
<lucene:document>

<lucene:field name="tile" type="keyword">sqdqsdq</lucene:field>
<lucene:field name="description" type="text"> bla bal blalael balbal </lucene:field>
<lucene:field name="date" type="date" dateformat="MM/dd/yyyy">10/12/2002</lucene:field> (see the Java API class SimpleDateFormat for more explanation of the dateformat attribute)
<lucene:field name="date" type="unstored" >just indexed information (not stored)</lucene:field>
<lucene:field name="date" type="unindexed" >just stored information (not indexed)</lucene:field>
</lucene:document>


<lucene:document>
<lucene:field name="author" type="keyword" boost="2">Mr Author</lucene:field> (boosts the field for the search; see the Lucene documentation)
<lucene:field name="langage" type="keyword">french</lucene:field>
</lucene:document>
</lucene:index>


<lucene:delete directory="d:/indexbase">
<lucene:document field="id" value="1E3RFE"/> //delete all documents with the field id ="1E3RFE"
<lucene:document field="author" value="Mr Author"/>
</lucene:delete>
</page>




Example of Output Source :

<page xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
<lucene:index nbdocuments="2"/>
<lucene:delete nbdocuments="1"/>
</page>

_________________________________________________________________
MSN Messenger 6 http://g.msn.fr/FR1001/866 : dialoguez en son et en image avec vos amis.
package org.paris5.cocoon.transformation;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;
import java.util.Date;
import java.text.SimpleDateFormat;

import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.component.ComponentException;
import org.apache.avalon.framework.component.ComponentManager;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;

import org.apache.avalon.excalibur.pool.Recyclable;

import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.transformation.AbstractSAXTransformer;

import org.apache.cocoon.environment.SourceResolver;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateField;
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.text.*;

/**
 * A lucene index creation transformer.
 * @author Nicolas Maisoneuve
 *
 <p><strong>Example of input source:</strong></p>
 <p>&lt;page xmlns:lucene=&quot;http://apache.org/cocoon/lucene/1.0&quot;&gt;<br>&lt;
   lucene:index create=&quot;true&quot; <br>
   analyzer=&quot;org.apache.lucene.analysis.standard.StandardAnalyzer&quot;<br>
   directory=&quot;d:/indexbase&quot;<br>
   merge-factor=&quot;merge-factor&quot;&gt;<br>
 &lt;lucene:document&gt;</p>
 <p>&lt;lucene:field name=&quot;tile&quot; type=&quot;keyword&quot;&gt;sqdqsdq&lt;/lucene:field&gt;<br>
 &lt;lucene:field name=&quot;description&quot; type=&quot;text&quot;&gt; bla bal
   blalael balbal &lt;/lucene:field&gt;<br>
 &lt;lucene:field name=&quot;date&quot; type=&quot;date&quot; dateformat=&quot;MM/dd/yyyy&quot;&gt;10/12/2002&lt;/lucene:field&gt; </p>
 <p><em>(see
 java API Class SimpleDateFormat for more explanation about the dateFormat attribut)</em></p>
 <p><br>
 &lt;lucene:field name=&quot;date&quot; type=&quot;unstored&quot; &gt;just indexed
     information (not stored)&lt;/lucene:field&gt;<br>
 &lt;lucene:field name=&quot;date&quot; type=&quot;unindexed&quot; &gt;just stored
     information (not indexed)&lt;/lucene:field&gt;<br>
 &lt;/lucene:document&gt;</p>
 <p> &lt;lucene:document&gt;<br>
 &lt;lucene:field name=&quot;author&quot; type=&quot;keyword&quot; boost=&quot;2&quot;&gt;Mr
 Author&lt;/lucene:field&gt; <p><em>(boost the field for the search (see Lucene documentation))</p>
 </em><p>&lt;lucene:field name=&quot;langage&quot; type=&quot;keyword&quot;&gt;french&lt;/lucene:field&gt;<br>
 &lt;/lucene:document&gt;<br>
 &lt; /lucene:index&gt;</p>
 <p>&lt;lucene:delete directory=&quot;d:/indexbase&quot; &gt;<br>
 &lt;lucene:document field=&quot;author&quot; value=&quot;Mr Author&quot;/&gt; <em> (delete
 all documents with the field author =&quot;Mr Author&quot;)</em><br>&lt;lucene:document
 field=&quot;id&quot; value=&quot;1E3RFE&quot;/&gt; <br>
 &lt; /lucene:delete&gt;</p>
&lt;/page&gt;
<p><strong>Example of Output Source</strong></p>
<p>&lt;page xmlns:lucene=&quot;http://apache.org/cocoon/lucene/1.0&quot;&gt;<br>
&lt;
  lucene:index nbdocuments=&quot;2&quot;/&gt;<br>
&lt;
lucene:delete nbdocuments=&quot;1&quot;/&gt;<br>
&lt;/page&gt;
</p>

 */

public class LuceneIndexTransformer
    extends AbstractSAXTransformer
    implements Disposable, CacheableProcessingComponent, Recyclable,
    Configurable, Contextualizable {

  public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
  public static final String ANALYZER_CLASSNAME_PARAMETER =
      "analyzer-classname";

  public static final String DIRECTORY_CONFIG = "directory";
  public static final String DIRECTORY_PARAMETER = "directory";

  public static final String MERGE_FACTOR_CONFIG = "merge-factor";
  public static final String MERGE_FACTOR_PARAMETER = "merge-factor";

  public static final String DIRECTORY_DEFAULT = "index";
  public static final int MERGE_FACTOR_DEFAULT = 20;
  public static final String ANALYZER_CLASSNAME_DEFAULT =
      "org.apache.lucene.analysis.standard.StandardAnalyzer";

  public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";;
  public static final String LUCENE_QUERY_ELEMENT = "index";
  public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
  public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
  public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
  public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE =
      "merge-factor";

  public static final String LUCENE_DELETE_ELEMENT = "delete";
  public static final String DOCUMENT_NAME_ATTRIBUTE = "name";
  public static final String DOCUMENT_VALUE_ATTRIBUTE = "value";

  public static final String LUCENE_DOCUMENT_ELEMENT = "document";
  public static final String LUCENE_DOCUMENT_FIELD_ATTRIBUTE = "field";
  public static final String LUCENE_DOCUMENT_VALUE_ATTRIBUTE = "value";
  public static final String LUCENE_FIELD_ELEMENT = "field";
  public static final String LUCENE_FIELD_NAME_ATTRIBUTE = "name";
  public static final String LUCENE_FIELD_TYPE_ATTRIBUTE = "type";
  public static final String LUCENE_FIELD_DATEFORMAT_ATTRIBUTE = "dateformat";
  public static final String LUCENE_FIELD_BOOST_ATTRIBUTE = "boost";

  public static final int TYPE_KEYWORD = 1;
  public static final int TYPE_TEXT = 2;
  public static final int TYPE_DATE = 3;
  public static final int TYPE_UNSTORED = 4;
  public static final int TYPE_UNINDEXED = 5;

  public static final int ADD_ACTION = 1;
  public static final int DELETE_ACTION = 2;

  // Initialization time variables
  protected ComponentManager manager = null;
  protected File workDir = null;
  protected int nbdocuments;
  protected int action;

  // Declaration time parameters values
  private String analyzerClassnameDefault;
  private String directoryDefault;
  private int mergeFactorDefault;

  // Invocation time parameters values
  private String analyzerClassname;
  private String directory;
  private int mergeFactor;

  // Runtime variables
  private int processing;
  private IndexWriter writer;
  private IndexReader reader;

  private Term term;
  private Document bodyDocument;
  private String fieldname;
  private int fieldtype;
  private float fieldboost;
  private String fieldvalue;
  private  SimpleDateFormat df;

  private static String uid(String url) {
    return url.replace('/', '\u0000'); // + "\u0000" + DateField.timeToString(urlConnection.getLastModified());
  }

  public void configure(Configuration conf) throws ConfigurationException {
    this.analyzerClassnameDefault = conf.getChild(ANALYZER_CLASSNAME_CONFIG)
        .getValue(ANALYZER_CLASSNAME_DEFAULT);
    this.mergeFactorDefault = conf.getChild(MERGE_FACTOR_CONFIG)
        .getValueAsInteger(MERGE_FACTOR_DEFAULT);
    this.directoryDefault = conf.getChild(DIRECTORY_CONFIG)
        .getValue(DIRECTORY_DEFAULT);

  }

  /**
   * Setup the transformer.
   */
  public void setup(SourceResolver resolver, Map objectModel, String src,
                    Parameters parameters) throws ProcessingException,
      SAXException, IOException {
    // We don't need all this stuff
    this.analyzerClassname = parameters.getParameter(
        ANALYZER_CLASSNAME_PARAMETER, analyzerClassnameDefault);
    this.directory = parameters.getParameter(DIRECTORY_PARAMETER,
                                             directoryDefault);
    this.mergeFactor = parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER,
        mergeFactorDefault);
  }

  public void compose(ComponentManager manager) throws ComponentException {
    this.manager = manager;
  }

  /**
   * Contextualize this class
   */
  public void contextualize(Context context) throws ContextException {
    this.workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
  }

  public void recycle() {
    this.processing = 0;
    if (this.writer != null) {
      try {
        this.writer.close();
      }
      catch (IOException ioe) {}
      this.writer = null;
    }
    if (this.reader != null) {
      try {
        this.reader.close();
      }
      catch (IOException ioe) {}
      this.reader = null;
    }

    this.bodyDocument = null;
  }

  public void dispose() {
  }

  /**
   * Generate the unique key.
   * This key must be unique inside the space of this component.
   *
   * @return The generated key
   */
  public Serializable getKey() {
    return "1";
  }

  /**
   * Generate the validity object.
   *
   * @return The generated validity object or <code>null</code> if the
   *         component is currently not cacheable.
   */
  public SourceValidity getValidity() {
    return NOPValidity.SHARED_INSTANCE;
  }

  public void startDocument() throws SAXException {
    super.startDocument();
  }

  public void endDocument() throws SAXException {
    super.endDocument();
  }

  /**
   * Begin the scope of a prefix-URI Namespace mapping.
   *
   * @param prefix The Namespace prefix being declared.
   * @param uri The Namespace URI the prefix is mapped to.
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if (processing == 0) {
      super.startPrefixMapping(prefix, uri);
    }
  }

  /**
   * End the scope of a prefix-URI mapping.
   *
   * @param prefix The prefix that was being mapping.
   */
  public void endPrefixMapping(String prefix) throws SAXException {
    if (processing == 0) {
      super.endPrefixMapping(prefix);
    }
  }

  public void startElement(String namespaceURI, String localName, String qName,
                           Attributes atts) throws SAXException {
    //System.out.println("START processing: "+processing+" "+localName);
    if (processing == 0) {

      if (LUCENE_URI.equals(namespaceURI)) {

        // INDEX ACTION
        if (LUCENE_QUERY_ELEMENT.equals(localName)) {
          action = ADD_ACTION;

          // create base parameter
          String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
          boolean bCreate = sCreate != null &&
              (sCreate.equalsIgnoreCase("yes") ||
               sCreate.equalsIgnoreCase("true"));

          // analyzer parameter
          String analyzerClassname =
              atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
          if (analyzerClassname == null) {
            analyzerClassname = this.ANALYZER_CLASSNAME_DEFAULT;
          }
          Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(analyzerClassname);

          // mergeFactor parameter
          String sMergeFactor =
              atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
          int mergeFactor = this.mergeFactor;
          if (sMergeFactor != null) {
            mergeFactor = Integer.parseInt(sMergeFactor);
          }

          // directory parameter
          String directoryName =
              atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
          if (directoryName == null) {
            directoryName = this.directory;

            //System.out.println("QUERY Create=" + bCreate + ", Directory=" + directoryName + ", Analyzer=" + analyzerClassname);
          }
          try {
            Directory directory = LuceneCocoonHelper.getDirectory(new File(
                workDir, directoryName), bCreate);

            writer = new IndexWriter(directory, analyzer, bCreate);
            writer.mergeFactor = mergeFactor;
          }
          catch (IOException e) {
            throw new SAXException(e);
          }

          processing = 1;
        }

        // DELETE ACTION
        else if (LUCENE_DELETE_ELEMENT.equals(localName)) {
          action = DELETE_ACTION;

          // directory parameter
          String directoryName =
              atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
          if (directoryName == null) {
            directoryName = this.directory;
          }
          try {
            Directory directory = LuceneCocoonHelper.getDirectory(
                new File(workDir, directoryName), false);

            reader = LuceneCocoonHelper.getIndexReader(directory);

            //System.out.println("DELETE Directory=" + directoryName);
          }
          catch (IOException e) {
            throw new SAXException(e);
          }
          processing = 1;
        }
      }
      else {
        super.startElement(namespaceURI, localName, qName, atts);
      }
    }

    else if (processing == 1) {
      if (LUCENE_URI.equals(namespaceURI) &&
          LUCENE_DOCUMENT_ELEMENT.equals(localName)) {

        if (action == ADD_ACTION) {
          this.bodyDocument = new Document();
        }
        if (action == DELETE_ACTION) {
          this.term = new Term(atts.getValue(LUCENE_DOCUMENT_FIELD_ATTRIBUTE),
                               atts.getValue(LUCENE_DOCUMENT_VALUE_ATTRIBUTE));
        }
        processing = 2;
      }
      else {
        throw new SAXException(
            "<lucene:query> element can contain only <lucene:document> elements!");
      }
    }
    else if (processing == 2) {
      if (LUCENE_URI.equals(namespaceURI) &&
          LUCENE_FIELD_ELEMENT.equals(localName)) {

        this.fieldname = atts.getValue(LUCENE_FIELD_NAME_ATTRIBUTE);
        if (this.fieldname == null || this.fieldname.equals("")) {
          throw new SAXException(
              "<lucene:field> element must contain name attribut");
        }

        String fieldtype = atts.getValue(LUCENE_FIELD_TYPE_ATTRIBUTE);
        if (fieldtype == null || fieldtype.equals("")) {
          throw new SAXException(
              "<lucene:field> element must contain a type attribut");
        }

        if (fieldtype.equals("keyword")) {
          this.fieldtype = TYPE_KEYWORD;
        }
        else if (fieldtype.equals("text")) {
          this.fieldtype = TYPE_TEXT;
        }
        else if (fieldtype.equals("date")) {
          this.fieldtype = TYPE_DATE;
          String pattern = atts.getValue(LUCENE_FIELD_DATEFORMAT_ATTRIBUTE);

          if (pattern == null || pattern.equals("")) {
            throw new SAXException(
                "<lucene:field type=\"date\"> element must contain a dateformat attribut");
          }
          df= new SimpleDateFormat(pattern);

        }
        else if (fieldtype.equals("unstored")) {
          this.fieldtype = TYPE_UNSTORED;
        }
        else if (fieldtype.equals("unindexed")) {
          this.fieldtype = TYPE_UNINDEXED;
        }

        String fieldboost = atts.getValue(LUCENE_FIELD_BOOST_ATTRIBUTE);
        if (fieldboost == null) {
          this.fieldboost = 1.0f;
        }
        else {
          this.fieldboost = Float.parseFloat(fieldboost);
        }
        System.out.println("fieldname: " + fieldname + " type: " + fieldtype +
                           " boost: " + fieldboost);
        processing = 3;
      }
      else {
        throw new SAXException(
            "<lucene:document> element can contain only <lucene:field> elements!");
      }
    }

  }

  public void endElement(String namespaceURI, String localName, String qName) throws
      SAXException {

    //System.out.println("END: processing: " + processing + " el: " + localName);
    if (processing == 1) {

      if (LUCENE_URI.equals(namespaceURI)) {

        //ADD ACTION
        if (LUCENE_QUERY_ELEMENT.equals(localName)) {
          // End query processing
          AttributesImpl attrs = new AttributesImpl();
          attrs.addAttribute(null, "nbdocuments",
                             "nbdocuments", "CDATA",
                             Integer.toString(nbdocuments));

          super.startElement(namespaceURI, localName, qName, attrs);
          super.endElement(namespaceURI, localName, qName);
          nbdocuments = 0;

          try {
            this.writer.optimize();
            this.writer.close();
            this.writer = null;

          }
          catch (IOException e) {
            throw new SAXException(e);
          }
          this.processing = 0;
        }
        // DELETE ACTION
        else if (LUCENE_DELETE_ELEMENT.equals(localName)) {
          try {

            AttributesImpl attrs = new AttributesImpl();
            attrs.addAttribute(null, "nbdocuments",
                               "nbdocuments", "CDATA",
                               Integer.toString(nbdocuments));

            super.startElement(namespaceURI, localName, qName, attrs);
            super.endElement(namespaceURI, localName, qName);
            nbdocuments = 0;

            this.reader.close();
            this.reader = null;

          }
          catch (IOException e) {
            throw new SAXException(e);
          }
          this.processing = 0;
        }

      }
      else {
        if (action == ADD_ACTION) {
          throw new SAXException("</lucene:" + LUCENE_QUERY_ELEMENT +
                                 " was expected!");
        }
        else if (action == DELETE_ACTION) {
          throw new SAXException("</lucene:" + LUCENE_DELETE_ELEMENT +
                                 " was expected!");
        }
      }
    }

    else if (processing == 2) {

      if (action == ADD_ACTION) {
        try {
          //System.out.println("DOCUMENT \n " + this.bodyDocument);
          this.writer.addDocument(this.bodyDocument);
          nbdocuments++;
          this.bodyDocument = null;
        }
        catch (IOException e) {
          throw new SAXException(e);
        }
        this.processing = 1;
      }

      else if (action == DELETE_ACTION) {
        try {
          //System.out.println("term \n " + this.term);
          nbdocuments += reader.delete(this.term);
        }
        catch (IOException e) {
          throw new SAXException(e);
        }

        this.processing = 1;
      }
    }
    else if (processing == 3) {
      Field f = null;
      // add Field
      switch (fieldtype) {
        case TYPE_KEYWORD:
          f = Field.Keyword(fieldname, fieldvalue);
          break;
        case TYPE_TEXT:
          f = Field.Text(fieldname, fieldvalue);
          break;
        case TYPE_DATE:
          try {
            f = Field.Keyword(fieldname,
                              DateField.dateToString(df.parse(fieldvalue)));
          }
          catch (ParseException ex) {
            throw new SAXException(ex);
          }
          break;
        case TYPE_UNSTORED:
          f = Field.UnStored(fieldname, fieldvalue);
          break;
        case TYPE_UNINDEXED:
          f = Field.UnIndexed(fieldname, fieldvalue);
          break;
      }
      if (fieldboost != 1.0f) {
        f.setBoost(fieldboost);
      }
      bodyDocument.add(f);
      processing = 2;

    }
    else {
      super.endElement(namespaceURI, localName, qName);
    }
  }

  public void characters(char[] ch, int start, int length) throws
      SAXException {

    if (processing == 3) {
      this.fieldvalue = new String(ch, start, length);
      //System.out.println("value: "+this.fieldvalue);
    }
    else {
      super.characters(ch, start, length);
    }
  }
}

Reply via email to