Hi all,
I'm getting a strange user error when working with Data.ByteString.Char8 and
Text.Regex.PCRE.
Error I get is:
CustomerMaster: user error (Text.Regex.PCRE.ByteString died: (ReturnCode
0,"Ptr
parameter was nullPtr in Text.Regex.PCRE.Wrap.wrapMatch cstr"))
The part of the code causing problems is at the end of this e-mail. The
complete code is attached.
I'm working at cleansing customer addresses for a ERP system data migration.
The failing function is supposed to split some bytestrings using a list of
regular expressions (split on words like suite, building, attention, etc...)
stored in the "tags" binding. I use a foldl' to try each regular expression
on each address (addr is a list of address part, can be seen as a list of
bytestring values).
The funny and weird thing is that when I have only one regular expression in
tags, it works. When I have more than one regular expression in tags it
fails (user error) the first time one of the regular expression matches. A
match replaces the current address part with 3 new parts, before, match,
after. The next regular expression in the tags list will test the new before
and after parts. I presume it's the testing of those newly created parts
that causes the problem.
Any idea about what could go wrong here?
Thanks,
Olivier.
=== code ===
-- | Split unparsed address parts on keywords (suite, building, doors,
etc...)
splitOnTags :: AddressState
splitOnTags = do
addr <- get
put $ foldl' f addr tags
where
f a t = concatMap (split t) a
split (T {partType = p, regex = r}) a@(AP X v)
| BS.null y = [a]
| otherwise = [AP X x, AE p, AP X z] -- new address parts
where
(x, y, z) = match r v
split _ a = [a]
-- | Regex and part type representing tags used for splitting addresses
tags :: [Tag]
tags = [T AT
(makeRegexOpts compCaseless execBlank
"\\b(?:ATTN|ATTENTION|C/O)\\b[[:punct:]]?")
(BS.pack "ATTN: ")
,T PB
(makeRegexOpts compCaseless execBlank
"\\b(?:(?:P\\.?\\s?O\\.?\\s*)BOX|C\\.P\\.|POSTFACH)\
\[[:punct:]]?\\s+(?:NO.\\s+)?")
(BS.pack "PO BOX: ")
,T BLDG
(makeRegexOpts compCaseless execBlank
"\\b(?:BLDG|BUILDING|HANGAR)[[:punct:]]?")
(BS.pack "BLDG: ")
,T DOCK
(makeRegexOpts compCaseless execBlank
"\\bDOCK[[:punct:]]?")
(BS.pack "DOCK: ")
,T STE
(makeRegexOpts compCaseless execBlank
"\\b(?:SUITE|STE|APT|ROOM)[[:punct:]]?\
\\\s+(?:NO[[:punct:]]?)?")
(BS.pack "STE: ")
,T UNIT
(makeRegexOpts compCaseless execBlank
"\\bUNIT[[:punct:]]?")
(BS.pack "UNIT: ")
,T FLOOR
(makeRegexOpts compCaseless execBlank
"FLOOR")
(BS.pack "FLOOR: ")
,T DOORS
(makeRegexOpts compCaseless execBlank
"\\bDOORS?[[:punct:]]?")
(BS.pack "DOORS: ")]
module Main where
import Control.Monad.State
import qualified Data.ByteString.Char8 as BS
import Data.Char (isSpace)
import Data.List
import System.Environment
import System.IO
import Text.Regex.PCRE
--------------------------------------------------------------------------------
-- Data declarations
--------------------------------------------------------------------------------
-- | An address
type Address = [AddressPart]
-- | An address part (Suite, Dock, City, etc...) or a line separator
data AddressPart = AP AddressPartType BS.ByteString
| AE AddressPartType -- tag found but not linked to text
| S AddressPartType -- line separator
deriving (Show, Eq)
-- | Tags for address parts
data AddressPartType = I -- Id
| N -- Name
| PF -- Partner function
| AT -- Attention
| N2 -- Name 2
| PB -- P.O. Box
| BLDG -- Building
| DOCK -- Dock
| STE -- Suite
| UNIT -- Unit
| FLOOR -- Floor
| DOORS -- Doors
| ST -- Street
| CI -- City
| CO -- Country
| R -- Region
| Z -- Zip code
| X -- Unparsed
| E -- Error message
| L1 -- Start of Line 1
| L2 -- Start of Line 2
| L3 -- Start of Line 3
| L4 -- Start of Line 4 (zip code)
deriving (Show, Eq)
-- | The address state the program will run in
type AddressState = State Address ()
data Tag = T {partType :: AddressPartType,
regex :: Regex,
partName :: BS.ByteString}
--------------------------------------------------------------------------------
-- Formating and extraction of address parts
--------------------------------------------------------------------------------
-- | format address part for printing
format :: AddressPart -> BS.ByteString
format (S _) = BS.empty
format (AE p) = format (AP p tagEMPTY)
format (AP PB v) = BS.append tagPOBOX v
format (AP BLDG v) = BS.append tagBLDG v
format (AP DOCK v) = BS.append tagDOCK v
format (AP STE v) = BS.append tagSTE v
format (AP UNIT v) = BS.append tagUNIT v
format (AP FLOOR v) = BS.append tagFLOOR v
format (AP DOORS v) = BS.append tagDOORS v
format (AP X v) = BS.append tagX v
format (AP _ v) = v
-- | ByteStringS used for reporting
tagPOBOX, tagBLDG, tagDOCK, tagSTE, tagUNIT, tagFLOOR, tagDOORS, tagX, tagEMPTY
:: BS.ByteString
tagPOBOX = BS.pack "PO BOX: "
tagBLDG = BS.pack "BLDG: "
tagDOCK = BS.pack "DOCK: "
tagSTE = BS.pack "STE: "
tagUNIT = BS.pack "UNIT: "
tagFLOOR = BS.pack "FLOOR: "
tagDOORS = BS.pack "DOORS: "
tagX = BS.pack "XXX: "
tagEMPTY = BS.pack "!!!EMPTY!!!"
-- Get address part value
value :: AddressPart -> BS.ByteString
value (AP _ v) = v
value (S _) = error "No value: AddressPart type S is a separator"
value (AE t) = BS.append (BS.pack "AE: ") (BS.pack $ show t)
--value (AE _) = error "No value: AddressPart type AE contains no value" -- TODO reactivate
-- Apply a ByteString conversion function to an AddressPart
apply :: (BS.ByteString -> BS.ByteString) -> AddressPart -> AddressPart
apply f (AP t v) = AP t (f v)
apply _ p = p
--------------------------------------------------------------------------------
-- IO functions
--------------------------------------------------------------------------------
-- | The App entry point
main :: IO ()
main = do
args <- getArgs
case args of
(file_name:[]) -> processFile file_name
_ -> usage
-- | The IO stuff
processFile :: String -> IO ()
processFile file_name = do
content <- BS.readFile file_name
let result = parseAddresses content
BS.putStrLn result
-- | How to use this application
usage :: IO ()
usage = do
progName <- getProgName
hPutStrLn stderr $ "Usage:\n\t" ++ progName ++ " file_to_parse.txt"
--------------------------------------------------------------------------------
-- Parsing functions
--------------------------------------------------------------------------------
-- | Parse the text flow
parseAddresses :: BS.ByteString -> BS.ByteString
parseAddresses = addHeader
. BS.unlines
. map (reportAddress . validateAddress . parseAddressLine)
. drop 1
. BS.lines
-- | add a header line to given ByteString
addHeader :: BS.ByteString -> BS.ByteString
addHeader = BS.append (BS.pack
"Id\tName\tPartnerFunc\tName 2\tAttn\tPO Box\tBldg\tDock\t\
\Suite\tUnit\tFloor\tDoors\tStreet\tCity\tCountry\t\
\Regio/State\tZip\tErrors\tUnparsed\n")
-- | Parse an address line
parseAddressLine :: BS.ByteString -> Address
parseAddressLine s = do
splitOnTags
`execState` ([AP I uid, AP PF pf, AP N name]
++ [S L1, AP X add1, S L2, AP X add2, S L3, AP X add3]
++ [S L4] ++ zipCode)
where
(_:uid:pf:name:add1:add2:add3:zipC:[]) = BS.split '\t' s
zipCode = if BS.null zipCT then [] else [AP Z zipCT]
zipCT = trim zipC
-- | Validate an address line
validateAddress :: Address -> Address
validateAddress = id
-- | Report an address line
reportAddress :: Address -> BS.ByteString
reportAddress addr
= BS.intercalate (BS.singleton '\t') (map (\f -> f addr)
[column (isPartType I)
,column (isPartType N)
,column (isPartType PF)
,column (isPartType N2)
,column (isPartType AT)
,column (isPartType PB)
,column (isPartType BLDG)
,column (isPartType DOCK)
,column (isPartType STE)
,column (isPartType UNIT)
,column (isPartType FLOOR)
,column (isPartType DOORS)
,column (isPartType ST)
,column (isPartType CI)
,column (isPartType CO)
,column (isPartType R)
,column (isPartType Z)
,column (isPartType E)]
++ (map value $ filter (isPartType X) addr))
-- | Write column data
column :: (AddressPart -> Bool) -> Address -> BS.ByteString
column test = BS.intercalate (BS.pack " || ") . map value . filter test
-- | Check if given AddressPart is of given type
isPartType :: AddressPartType -> AddressPart -> Bool
isPartType t (AP t' _) = t == t'
isPartType t (AE t') = t == t'
isPartType t (S t') = t == t'
-- | Remove whitespace from ends of ByteString
trim :: BS.ByteString -> BS.ByteString
trim = BS.reverse . BS.dropWhile isSpace . BS.reverse . BS.dropWhile isSpace
-- | Split unparsed address parts on keywords (suite, building, doors, etc...)
splitOnTags :: AddressState
splitOnTags = do
addr <- get
put $ foldl' f addr tags
where
f a t = concatMap (split t) a
split (T {partType = p, regex = r}) a@(AP X v)
| BS.null y = [a]
| otherwise = [AP X x, AE p, AP X z]
where
(x, y, z) = match r v
split _ a = [a]
-- | Regex and part type representing tags used for splitting addresses
tags :: [Tag]
tags = [T AT
(makeRegexOpts compCaseless execBlank
"\\b(?:ATTN|ATTENTION|C/O)\\b[[:punct:]]?")
(BS.pack "ATTN: ")
,T PB
(makeRegexOpts compCaseless execBlank
"\\b(?:(?:P\\.?\\s?O\\.?\\s*)BOX|C\\.P\\.|POSTFACH)\
\[[:punct:]]?\\s+(?:NO.\\s+)?")
(BS.pack "PO BOX: ")
,T BLDG
(makeRegexOpts compCaseless execBlank
"\\b(?:BLDG|BUILDING|HANGAR)[[:punct:]]?")
(BS.pack "BLDG: ")
,T DOCK
(makeRegexOpts compCaseless execBlank
"\\bDOCK[[:punct:]]?")
(BS.pack "DOCK: ")
,T STE
(makeRegexOpts compCaseless execBlank
"\\b(?:SUITE|STE|APT|ROOM)[[:punct:]]?\
\\\s+(?:NO[[:punct:]]?)?")
(BS.pack "STE: ")
,T UNIT
(makeRegexOpts compCaseless execBlank
"\\bUNIT[[:punct:]]?")
(BS.pack "UNIT: ")
,T FLOOR
(makeRegexOpts compCaseless execBlank
"FLOOR")
(BS.pack "FLOOR: ")
,T DOORS
(makeRegexOpts compCaseless execBlank
"\\bDOORS?[[:punct:]]?")
(BS.pack "DOORS: ")]
_______________________________________________
Haskell-Cafe mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/haskell-cafe