Hello again,
I can index pdf using:
*data-config.xml*
<?xml version="1.0" encoding="utf-8"?>
<dataConfig>
<dataSource type="BinFileDataSource" name="binary" />
<document>
<entity name="f" dataSource="binary" rootEntity="false"
processor="FileListEntityProcessor" baseDir="../solr/docu/" fileName=".*pdf"
recursive="true">
<entity name="tika" processor="TikaEntityProcessor"
url="${f.fileAbsolutePath}" format="text">
<field column="id" name="id" meta="true" />
<field column="fake_id" name="fake_id"
meta="true" />
<field column="model" name="model" meta="true"
/>
<field column="text" name="biog" />
</entity>
</entity>
</document>
</dataConfig>
I can also index a database using:
*data-config.xml*
<?xml version="1.0" encoding="utf-8"?>
<dataConfig>
<dataSource type="JdbcDataSource"
driver="com.mysql.jdbc.Driver"
url="jdbc:mysql://127.0.0.1:3306/rental"
user="root"
password="1a2b3c4d"
name="db" />
<dataSource type="BinFileDataSource" name="binary" />
<document>
<entity name="members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('m_',id) as fake_id,
id, firstname, lastname, biog, model from members">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="firstname" name="firstname" stripHTML="true" />
<field column="lastname" name="lastname" stripHTML="true" />
<field column="biog" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="new_members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('nm_',id) as
fake_id, id, firstname, lastname, biog, model from new_members">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="firstname" name="firstname" stripHTML="true" />
<field column="lastname" name="lastname" stripHTML="true" />
<field column="biog" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="books" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('b_',id) as fake_id, id, title, description, model from
books">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="title" name="title" stripHTML="true" />
<field column="description" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="journals" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('j_',id) as fake_id, id, title, description, model from
journals">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="title" name="title" stripHTML="true" />
<field column="description" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="cds" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('c_',id) as fake_id, id, title, description, model from
cd">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="title" name="title" stripHTML="true" />
<field column="description" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
</document>
</dataConfig>
For the above I have:
*schema.xml(fields)*
<fields>
<field name="id" type="string" indexed="true" stored="true" />
<field name="fake_id" type="string" indexed="true" stored="true" />
<field name="model" type="text_en" indexed="true" stored="true" />
<field name="firstname" type="text_en" indexed="true" stored="true"/>
<field name="lastname" type="text_en" indexed="true" stored="true"/>
<field name="title" type="text_en" indexed="true" stored="true"/>
<field name="biog" type="text_en" indexed="true" stored="true"/>
</fields>
<uniqueKey>fake_id</uniqueKey>
<defaultSearchField>biog</defaultSearchField>
But when I am using the below data-config.xml indexing fails:
*data-config.xml*
<?xml version="1.0" encoding="utf-8"?>
<dataConfig>
<dataSource type="JdbcDataSource"
driver="com.mysql.jdbc.Driver"
url="jdbc:mysql://127.0.0.1:3306/rental"
user="root"
password="1a2b3c4d"
name="db" />
<dataSource type="BinFileDataSource" name="binary" />
<document>
<entity name="members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('m_',id) as fake_id,
id, firstname, lastname, biog, model from members">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="firstname" name="firstname" stripHTML="true" />
<field column="lastname" name="lastname" stripHTML="true" />
<field column="biog" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="new_members" dataSource="db"
transformer="HTMLStripTransformer" query="select CONCAT('nm_',id) as
fake_id, id, firstname, lastname, biog, model from new_members">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="firstname" name="firstname" stripHTML="true" />
<field column="lastname" name="lastname" stripHTML="true" />
<field column="biog" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="books" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('b_',id) as fake_id, id, title, description, model from
books">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="title" name="title" stripHTML="true" />
<field column="description" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="journals" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('j_',id) as fake_id, id, title, description, model from
journals">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="title" name="title" stripHTML="true" />
<field column="description" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="cds" dataSource="db" transformer="HTMLStripTransformer"
query="select CONCAT('c_',id) as fake_id, id, title, description, model from
cd">
<field column="id" name="id" />
<field column="fake_id" name="fake_id" />
<field column="title" name="title" stripHTML="true" />
<field column="description" name="biog" stripHTML="true" />
<field column="model" name="model" stripHTML="true" />
</entity>
<entity name="f" dataSource="binary" rootEntity="false"
processor="FileListEntityProcessor" baseDir="../solr/docu/" fileName=".*pdf"
recursive="true">
<entity name="tika" processor="TikaEntityProcessor"
url="${f.fileAbsolutePath}" format="text">
<field column="id" name="id" meta="true" />
<field column="fake_id" name="fake_id"
meta="true" />
<field column="model" name="model" meta="true"
/>
<field column="text" name="biog" />
</entity>
</entity>
</document>
</dataConfig>
*The log file is outputting:*
SEVERE: Exception while processing: f document :
null:org.apache.solr.handler.dataimport.DataImportHandlerException: Unable
to execute query: C:\solr\tomcat\..\solr\docu\dinos.pdf Processing Document
# 36
at
org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow(DataImportHandlerException.java:72)
at
org.apache.solr.handler.dataimport.JdbcDataSource$ResultSetIterator.<init>(JdbcDataSource.java:253)
at
org.apache.solr.handler.dataimport.JdbcDataSource.getData(JdbcDataSource.java:210)
at
org.apache.solr.handler.dataimport.JdbcDataSource.getData(JdbcDataSource.java:39)
at
org.apache.solr.handler.dataimport.TikaEntityProcessor.nextRow(TikaEntityProcessor.java:103)
at
org.apache.solr.handler.dataimport.EntityProcessorWrapper.pullRow(EntityProcessorWrapper.java:330)
at
org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:296)
at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:683)
at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:709)
at
org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:619)
at
org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:327)
at
org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:225)
at
org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:375)
at
org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:445)
at
org.apache.solr.handler.dataimport.DataImporter$1.run(DataImporter.java:426)
Caused by: com.mysql.jdbc.exceptions.jdbc4.MySQLSyntaxErrorException: You
have an error in your SQL syntax; check the manual that corresponds to your
MySQL server version for the right syntax to use near
'C:\solr\tomcat\..\solr\docu\dinos.pdf' at line 1
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(Unknown Source)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(Unknown
Source)
at java.lang.reflect.Constructor.newInstance(Unknown Source)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:411)
at com.mysql.jdbc.Util.getInstance(Util.java:386)
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:1052)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:4096)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:4028)
at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2490)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2651)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2677)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2627)
at com.mysql.jdbc.StatementImpl.execute(StatementImpl.java:841)
at com.mysql.jdbc.StatementImpl.execute(StatementImpl.java:681)
at
org.apache.solr.handler.dataimport.JdbcDataSource$ResultSetIterator.<init>(JdbcDataSource.java:246)
... 13 more
Is it possible to index pdfs, docs, rtf along with database and having a
single document?
Thank in advance,
Tom
--
View this message in context:
http://lucene.472066.n3.nabble.com/Is-it-possible-to-index-pdfs-and-database-into-single-document-tp3980761.html
Sent from the Solr - User mailing list archive at Nabble.com.