METS and ALTO are XML standards for describing physical objects. In this
case, we're describing books. The METS file holds the metadata (author,
title, etc.); the ALTO file holds the physical description (words on the
page, formatting of the page). So it's a one (METS) to many (ALTO)
relationship.
The directory structure:
/our/collection/IDxxx/:
IDxxx-mets.xml
ALTO/
/our/collection/IDxxx/ALTO/:
IDxxx-ALTO001.xml
IDxxx-ALTO002.xml
i.e., one XML file per scanned book page.
Beyond the ID number as part of the file names, the mets file contains
no reference to the alto children. The alto children do contain a
reference to the jpg page scan, which is labelled with the ID number as
part of the name.
The idea is to create a full text index of the alto content, accompanied
by the author/title info from the mets file for purposes of results
display. The first try with this is attempting a recursive
FileDataSource approach.
It was relatively easy to create a "content" field which holds the text
of the page (each word is actually an attribute of a separate tag), but
I'm having difficulty determining how to conditionally add the
author and title data from the METS file to the rows created with the
ALTO content field. For starters, it will involve extracting the ID
number from both the METS and ALTO filenames with a regex, but even
then I don't see how to keep the association straight, since there is
not one METS file per ALTO file and the metadata is also not a static
string for the entire index.
Thanks for any hints you can provide.
Fred
University of Texas at Austin
==========================================
data-config.xml thus far:
<dataConfig>
  <!--
    DataImportHandler configuration for a METS/ALTO book collection.
    The outer entity lists XML files under baseDir; the inner entity parses
    each listed file and emits a record for every /mets or /alto root element.
  -->
  <dataSource type="FileDataSource" />
  <document>
    <!--
      fileName is a regular expression. The dot is escaped so that only names
      ending in the literal ".xml" extension are selected; an unescaped dot
      would also accept names such as "notes-xml".
    -->
    <entity name="landscapes"
            rootEntity="false"
            processor="FileListEntityProcessor"
            fileName="\.xml$"
            recursive="true"
            baseDir="/home/utlol/htdocs/lib-landscapes-new/publications/">
      <!--
        XPath expressions below are written without namespace prefixes
        (e.g. "mods" rather than "MODS:mods").
      -->
      <entity name="sample"
              rootEntity="true"
              stream="true"
              pk="filename"
              url="${landscapes.fileAbsolutePath}"
              processor="XPathEntityProcessor"
              forEach="/mets | /alto"
              transformer="TemplateTransformer,RegexTransformer,LogTransformer"
              logTemplate=" processing ${landscapes.fileAbsolutePath}"
              logLevel="info">
        <!-- use system filename for getting OCLC number -->
        <!-- we need it both for linking to results and for referencing the METS
             file -->
        <field column="fileAbsPath" template="${landscapes.fileAbsolutePath}" />

        <!-- Populated only for METS files. -->
        <field column="title"
               xpath="/mets/dmdSec/mdWrap/xmlData/mods/titleInfo/title" />

        <!--
          Author field, currently disabled. The XPath had been garbled by
          address obfuscation ("na...@id=..."); it is reconstructed here with
          the attribute and element names as they appear in the METS sample
          (ID, namePart). TODO confirm the exact case before enabling.
        <field column="author"
               xpath="/mets/dmdSec/mdWrap/xmlData/mods/name[@ID='MODSMD_PRINT_N1']/namePart[@type='given']" />
        -->

        <!-- Populated only for ALTO files. -->
        <!-- NOTE(review): pk is "filename", which METS records never receive; confirm this is intended. -->
        <field column="filename"
               xpath="/alto/Description/sourceImageInformation/fileName" />
        <field column="content"
               xpath="/alto/Layout/Page/PrintSpace/TextBlock/TextLine/String/@CONTENT" />
      </entity>
    </entity>
  </document>
</dataConfig>
==============================================
METS example:
<?xml version="1.0" encoding="UTF-8"?>
<mets xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://www.loc.gov/METS/"
xsi:schemaLocation="http://www.loc.gov/METS/
http://schema.ccs-gmbh.com/docworks/version20/mets-docworks.xsd"
xmlns:MODS="http://www.loc.gov/mods/v3"
xmlns:mix="http://www.loc.gov/mix/"
xmlns:xlink="http://www.w3.org/1999/xlink" TYPE="METAe_Monograph"
LABEL="ENVIRONMENTAL GEOLOGIC ATLAS OF THE TEXAS COASTAL ZONE-
Kingsville Area">
<metsHdr CREATEDATE="2010-05-06T11:21:18" LASTMODDATE="2010-05-06T11:21:18">
<agent ROLE="CREATOR" TYPE="OTHER" OTHERTYPE="SOFTWARE">
<name>CCS docWORKS/METAe Version 6.3-0</name>
<note>docWORKS-ID: 1677</note>
</agent>
</metsHdr>
<dmdSec ID="MODSMD_PRINT">
<mdWrap MIMETYPE="text/xml" MDTYPE="MODS" LABEL="Bibliographic meta-data
of the printed version">
<xmlData>
<MODS:mods>
<MODS:titleInfo ID="MODSMD_PRINT_TI1" xml:lang="en">
<MODS:title>ENVIRONMENTAL GEOLOGIC ATLAS OF THE TEXAS COASTAL ZONE-
Kingsville Area</MODS:title>
</MODS:titleInfo>
<MODS:name ID="MODSMD_PRINT_N1" type="personal">
<MODS:namePart type="given">L F. Brown, Jr., J. H. McGowen, T. J. Evans,
C. G.</MODS:namePart>
<MODS:namePart type="family">Groat</MODS:namePart>
<MODS:role>
<MODS:roleTerm>aut</MODS:roleTerm>
</MODS:role>
</MODS:name>
<MODS:name ID="MODSMD_PRINT_N2" type="personal">
<MODS:namePart type="given">W. L.</MODS:namePart>
<MODS:namePart type="family">Fisher</MODS:namePart>
<MODS:role>
<MODS:roleTerm>aut</MODS:roleTerm>
</MODS:role>
</MODS:name>
============================================
ALTO example:
<?xml version="1.0" encoding="UTF-8"?>
<alto xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="http://schema.ccs-gmbh.com/metae/alto-1-1.xsd"
xmlns:xlink="http://www.w3.org/TR/xlink">
<Description>
<MeasurementUnit>mm10</MeasurementUnit>
<sourceImageInformation>
<fileName>/Docworks/IN/GeologyBooks/txu-oclc-6917337/txu-oclc-6917337-009.jpg</fileName>
</sourceImageInformation>
<OCRProcessing ID="OCRPROCESSING_1">
<preProcessingStep>
<processingSoftware>
<softwareCreator>CCS Content Conversion Specialists GmbH,
Germany</softwareCreator>
<softwareName>CCS docWORKS</softwareName>
<softwareVersion>6.3-0.93</softwareVersion>
</processingSoftware>
</preProcessingStep>
<ocrProcessingStep>
<processingSoftware>
<softwareCreator>ABBYY (BIT Software), Russia</softwareCreator>
<softwareName>FineReader</softwareName>
<softwareVersion>7.0</softwareVersion>
</processingSoftware>
</ocrProcessingStep>
</OCRProcessing>
</Description>
<Styles>
<TextStyle ID="TXT_0" FONTSIZE="11" FONTFAMILY="Times New Roman"/>
<ParagraphStyle ID="PAR_CENTER" ALIGN="Center"/>
<ParagraphStyle ID="PAR_BLOCK" ALIGN="Block"/>
<ParagraphStyle ID="PAR_RIGHT" ALIGN="Right"/>
<ParagraphStyle ID="PAR_LEFT" ALIGN="Left"/>
</Styles>
<Layout>
<Page ID="P9" PHYSICAL_IMG_NR="9" HEIGHT="2855" WIDTH="2258">
<TopMargin ID="P9_TM00001" HPOS="0" VPOS="0" WIDTH="2258" HEIGHT="196"/>
<LeftMargin ID="P9_LM00001" HPOS="0" VPOS="196" WIDTH="151" HEIGHT="2345"/>
<RightMargin ID="P9_RM00001" HPOS="2104" VPOS="196" WIDTH="154"
HEIGHT="2345"/>
<BottomMargin ID="P9_BM00001" HPOS="0" VPOS="2541" WIDTH="2258"
HEIGHT="314"/>
<PrintSpace ID="P9_PS00001" HPOS="151" VPOS="196" WIDTH="1953"
HEIGHT="2345">
<TextBlock ID="P9_TB00001" HPOS="1045" VPOS="196" WIDTH="173"
HEIGHT="28" STYLEREFS="TXT_0 PAR_CENTER">
<TextLine ID="P9_TL00001" HPOS="1045" VPOS="197" WIDTH="173" HEIGHT="27">
<String ID="P9_ST00001" HPOS="1045" VPOS="197" WIDTH="173" HEIGHT="27"
CONTENT="Preface" WC="0.98" CC="0000000"/>
</TextLine>