--- "Daniel F. Savarese" <[EMAIL PROTECTED]> wrote: >
> >I have a regex which I use to parse HTML files
> which
> >are marked up with HTML comments. Performance is
> fine
> >but then dips as I increase the number of contained
> >matches in the page.
>
> Can you post sample code reproducing the problem?
> Performance problems
Hi Daniel,
There are two files here
MarkedUpHTML.java
and
MarkedUpHTMLTest.java
Right at the end is the output from my test case
showing the performance dip.
The code won't compile but it will if references to
classes which have nothing to do with the pattern
matching are removed. If this is no good I'll isolate
the ORO code for you but it will have to be later,
just ask.
Many Thanks,
Janek Bogucki
/*
* MarkedUpHTML.java
*
* Created on 08 October 2001, 22:36
*/
package com.studylink.wps.html;
import java.util.Collections ;
import java.util.List ;
import java.util.LinkedList ;
import org.apache.oro.text.regex.* ;
/**
*
* We need to extract sections of HTML marked with
bracketed named comments
*
* From /index.html:
*
* <!-- name="news" template="news-tmpl" -->
* Note from Dru to Liz: The news goes here.
* Add the template in /_include/templates.html.
* Could we use miniburst.gif for the icon?
* <!-- name="/news" -->
*
* In the example above the template for the news is
referenced as template="news-tmpl".
* The WPS locates this in /_include/templates.html
which is like this.
*
* <html>
* <head>
* <title>NineMSN templates</title>
* </head>
* <body>
*
* <h1>This is the template for the news box</h1>
*
* <!-- name="news-tmpl" -->
* <!-- element="header" -->
* <table border=1>
* <tr><tr>Todays Education News</td></tr>
* <!-- element="/header" -->
* <!-- element="item-start">
* <tr><td valign="top" class="news">
* <!-- element="/item-start" -->
* This is some sample news from the world of
education.
* <!-- element="item-end" -->
* </td></tr>
* <!-- element="footer" -->
* </table>
* <!-- element="/footer" -->
* <!-- name="/news" -->
*
* </body>
* </html>
*
*
* Or if the template is embedded in the Server Active
HTML:
*
* From /index.html:
*
* <!-- name="news" -->
* <!-- element="header" -->
* <table border=1>
* <tr><tr>Todays Education News</td></tr>
* <!-- element="/header" -->
* <!-- element="item-start">
* <tr><td valign="top" class="news">
* <!-- element="/item-start" -->
* This is some sample news from the world of
education.
* <!-- element="item-end" -->
* </td></tr>
* <!-- element="/item-end" -->
* <!-- element="footer" -->
* </table>
* <!-- element="/footer" -->
* <!-- name="/news" -->
*
* We can parse out the bracketed sections using
* jakarta-oro through the backreference
* feature of Perl 5 regular expressions.
*
*
* @author jdb
* @version $Id: MarkedUpHtml.java,v 1.7 2001/10/15
09:15:42 jdb Exp $
*/
public class MarkedUpHtml {
private static final String
SERVER_ACTIVE_HTML_PATTERN =
/*
* <!-- name="news" -->
* or
* <!-- name="news" template="news-tmpl" -->
*
* (Remember to escape backslash
*
* \n -> \\n
* \w -> \\w
*
* etc)
*
*/
"<!--\\s*name\\s*=\\s*\"([\\w\\-]+)\"\\s*(template\\s*=\\s*\"[\\w\\-]+\")?\\s*-->"
+
/*
* enclosed content
*/
"((\\s|.)*)" +
/*
* <!-- name="/news" -->
*/
"<!--\\s*name\\s*=\\s*\"/\\1\"\\s*-->" ;
protected List mObjectSequence =
Collections.EMPTY_LIST;
/**
* Creates new MarkedUpHTML
*/
public MarkedUpHtml( Fragment html ) throws
MarkedUpHtmlException {
if ( html == null )
throw new MarkedUpHtmlException(
"Constructor called with null Fragment reference" ) ;
parse( html ) ;
}
/**
* Creates new MarkedUpHTML
*/
public MarkedUpHtml( String html ) throws
MarkedUpHtmlException {
if ( html == null )
throw new MarkedUpHtmlException(
"Constructor called with null String reference" ) ;
parse( new Fragment( html ) ) ;
}
/**
* Return the list of objects the HTML parsed to.
*/
public List getSequence() {
return mObjectSequence ;
}
/**
* Parse the HTML fragment (typically a
well-formed HTML document, though the
* implementation does not require this.) into a
sequence of objects where each
* object is either a HTML fragment or a Server
Active HTML object. See MarkedUpHTMLTest.java
* for examples.
*
* Match comment pairs in psuedo XML format:
*
* <!-- name="xxx" -->
* ...
* <!-- name="/xxx" -->
*
* or
*
* <!-- name="xxx" template="yyy" -->
* ...
* <!-- name="/xxx" -->
*
*/
protected void parse( Fragment html ) throws
MarkedUpHtmlException {
PatternMatcher matcher;
PatternCompiler compiler;
Pattern pattern = null ;
PatternMatcherInput input;
MatchResult result;
compiler = new Perl5Compiler();
matcher = new Perl5Matcher();
try {
pattern = compiler.compile(
SERVER_ACTIVE_HTML_PATTERN );
} catch(MalformedPatternException e) {
throw new MarkedUpHtmlException(
e.toString() );
}
String htmlString = html.getValue() ;
input = new PatternMatcherInput( htmlString );
/*
* If the first match is at offset 0 then the
* candidate was wrong so there is no HTML to
* acquire.
*
* If the first match is at offset > 0 then
there
* is HTML to acquire.
*/
int HTMLStartCandidateOffset = 0;
int lastMatchEndOffset = 0 ;
mObjectSequence = new LinkedList() ;
Object o ;
while(matcher.contains(input, pattern)) {
result = matcher.getMatch();
/*
* There was some HTML between either the
beginning of the input
* and the beginning of this match or
between the end of the last match of the
* beginning of this match.
*/
if ( result.beginOffset( 0 ) >
HTMLStartCandidateOffset ) {
/* endIndex in substring is exclusive
and so is endOffset in MatchResult */
o = new Fragment(
htmlString.substring(
result.beginOffset( 0 ), result.endOffset( 0 ) )
) ;
mObjectSequence.add( o ) ;
}
/* get ready for next opportunity */
HTMLStartCandidateOffset =
result.endOffset( 0 ) ;
String name = "my-dummy-name" ;
String template = "my-dummy-template-name"
;
o = new ServerActiveHtml( name, template )
;
mObjectSequence.add( o ) ;
lastMatchEndOffset = result.endOffset( 0 )
;
if ( false ) {
System.out.println( "*** MATCH ***\n"
+ result.toString() );
System.out.println("BEGIN OFFSET: " +
result.beginOffset( 0 ) );
System.out.println("END OFFSET: " +
result.endOffset( 0 ) );
//actualMatchList += result.toString
() + "/" + result.beginOffset (0) + "/" +
result.endOffset (0) + "/" ;
if ( false ) {
int groupCnt = result.groups();
System.out.println("Number of
Groups: " + groupCnt);
/*
* group 0 is the entire matched string.
*
*/
for ( int groupIdx = 1 ; groupIdx
< groupCnt ; groupIdx++ ) {
String group = result.group(
groupIdx ) ;
System.out.println("Group
index " + groupIdx + ":" + group );
//System.out.println ("Begin:
" + result.begin (group));
//System.out.println ("End: "
+ result.end (group));
}
} // false
}
} // false
/*
* Handle potential orphan.
*/
if ( lastMatchEndOffset != htmlString.length()
) {
o = new Fragment(
htmlString.substring( lastMatchEndOffset )
) ;
mObjectSequence.add( o ) ;
}
}
}
--- MarkedUpHtmlTest.java ---
/*
* MarkedUpHtmlTest.java
*
* Created on 08 October 2001, 22:51
*/
package tests.com.studylink.wps.html;
import java.io.*;
import java.util.*;
import junit.framework.*;
import junit.swingui.*;
//import com.studylink.app.*;
//import com.studylink.utility.*;
//import com.studylink.wps.*;
//import com.studylink.wps.channel.*;
import com.studylink.wps.html.*;
//import com.studylink.wps.jsp.*;
//import tests.com.studylink.wps.*;
/**
*
* @author jdb
* @version $Id: MarkedUpHtmlTest.java,v 1.5
2001/10/15 09:15:43 jdb Exp $
*/
public class MarkedUpHtmlTest extends TestCase {
final String MULTILINE_TEST = "<!-- name=\"news\"
-->\nAA\nBB\n<!-- name=\"/news\" -->" ;
final List MULTILINE_TEST_EXPECTED_OBJSEQ =
Arrays.asList( new String [] {
"ACTIVE"
} ) ;
final String EXTRA_SPACES = "<!-- name = \"news\"
-->\nAA\nBB\n<!-- name= \"/news\" -->" ;
final List EXTRA_SPACES_EXPECTED_OBJSEQ =
Arrays.asList( new String [] {
"ACTIVE"
} ) ;
final String NO_SERVER_ACTIVE_HTML =
"<html><head><title>Main Page</title></head>\n" +
"<body bgcolor=\"#336666\">\n" +
"<hr>\n" +
"<p>Welcome to Everville</p>\n" +
/* HTML */
"</body>\n" +
"</html>" ;
final List NO_SERVER_ACTIVE_HTML_EXPECTED_OBJSEQ =
Arrays.asList( new String [] {
"HTML"
} ) ;
final String REFERENCED_TEMPLATE =
/* HTML */
"<html><head><title>Main Page</title></head>\n" +
"<body bgcolor=\"#336666\">\n" +
/* Server Active HTML #1 */
"<!-- name=\"news\" template=\"news-tmpl\" -->\n"
+
"Note from Dru to Liz: The news goes here.<br>\n"
+
"Add the template in /_include/templates.html
later.<br>\n" +
"Could we use miniburst.gif for the icon?\n" +
"<!-- name=\"/news\" -->\n" +
/* HTML */
"<hr>\n" +
/* Server Active HTML #2 */
"<!-- name=\"contact\" -->\n" +
"<!-- element=\"body\" placeholder=\"tel:xxx\"
-->\n" +
"For enquiries about this channel call
toll-free:xxx.<br>\n" +
"<!-- element=\"/body\" -->\n" +
"<!-- name=\"/contact\" -->\n" +
/* HTML */
"</body>\n" +
"</html>" ;
final List REFERENCED_TEMPLATE_EXPECTED_OBJSEQ =
Arrays.asList( new String [] {
"HTML",
"ACTIVE",
"HTML",
"ACTIVE",
"HTML"
} ) ;
final String EMBEDDED_TEMPLATE =
"<html><head><title>Main Page</title></head>\n" +
"<body bgcolor=\"#336666\">\n" +
"<!-- name=\"news\" -->\n" +
/* embedded template */
"<!-- element=\"header\" -->\n" +
"<table border=1\n" + // line break edit
"valign=\"top\">\n" +
"<tr><tr>Todays Education News</td></tr>\n" +
"<!-- element=\"/header\" -->\n" +
"<!-- element=\"item-start\" -->\n" +
"<tr><td valign=\"top\" class=\"news\">\n" +
"<!-- element=\"/item-start\" -->\n" +
"This is some sample news from the world of
education.\n" +
"<!-- element=\"item-end\" -->\n" +
"</td></tr>\n" +
"<!-- element=\"/item-end\" -->\n" +
"<!-- element=\"footer\" -->\n" +
"</table>\n" +
"<!-- element=\"/footer\" -->\n" +
"<!-- name=\"/news\" -->\n" +
"</body>\n" +
"</html>" ;
final List EMBEDDED_TEMPLATE_EXPECTED_OBJSEQ =
Arrays.asList( new String [] {
"HTML",
"ACTIVE",
"HTML"
} ) ;
final String TEMPLATE_PAGE =
"<html>\n" +
"<head>\n" +
"<title>NineMSN templates</title>\n" +
"</head>\n" +
"<body>\n" +
"\n" +
"<h1>This is the template for the news box</h1>\n"
+
"\n" +
"<!-- template=\"news-tmpl\" -->\n" +
"<!-- element=\"header\" -->\n" +
"<table border=1>\n" +
"<tr><tr>Todays Education News</td></tr>\n" +
"<!-- element=\"/header\" -->\n" +
"<!-- element=\"item-start\" -->\n" +
"<tr><td valign=\"top\" class=\"news\">\n" +
"<!-- element=\"/item-start\" -->\n" +
"This is some sample news from the world of
education.\n" +
"<!-- element=\"item-end\" -->\n" +
"</td></tr>\n" +
"<!-- element=\"/item-end\" -->\n" +
"<!-- element=\"footer\" -->\n" +
"</table>\n" +
"<!-- element=\"/footer\" -->\n" +
"<!-- template=\"/news-tmpl\" -->\n" +
"\n" +
"</body>\n" +
"</html>\n" ;
/*
* <-->(\\s|.)*<-->
* matched
* <-->\nAA\nBB\n<-->
*
* <!-- name=news -->(\\s|.)*<!-- -->
* matched
* <!-- name=news -->\nAA\nBB\n<!-- -->
*
* <!-- name=\"news\" -->(\\s|.)*<!--
name=\"/news\" -->
* matched
* <!-- name=\"news\" -->\nAA\nBB\n<!--
name=\"/news\" -->
* (and EMBEDDED_TEMPLATE)
*
* <!-- name=\"([\\w\\-]+)\" -->(\\s|.)*<!--
name=\"/news\" -->
* matched
* <!-- name=\"news\" -->\nAA\nBB\n<!--
name=\"/news\" -->
* (and TEMPLATE_PAGE and EMBEDDED_TEMPLATE but
not REFERENCED_TEMPLATE)
*
* <!-- name=\"([\\w\\-]+)\" -->(\\s|.)*<!--
name=\"/news-tmpl\" -->
* matched
* TEMPLATE_PAGE
*
* <!-- name=\"([\\w\\-]+)\" -->(\\s|.)*<!--
name=\"/news\" -->
* TEMPLATE_PAGE ran so long I terminarted it
*
* <!-- name=\"([\\w\\-]+)\" -->(\\s|.)*<!--
name=\"/\\1\" -->
* matched everything as expected.
*
* <!-- name\\s*=\\s*\"([\\w\\-]+)\"
-->(\\s|.)*<!-- name\\s*=\\s*\"/\\1\" -->
* matched everything as expected.
*
* "<!--\\s*name\\s*=\\s*\"([\\w\\-]+)\"\\s*-->" +
* "(\\s|.)*" +
* "<!--\\s*name\\s*=\\s*\"/\\1\"\\s*-->" ;
* matched everything as expected.
*
*
"<!--\\s*name\\s*=\\s*\"([\\w\\-]+)\"\\s*(template=\"news-tmpl\")?\\s*-->"
+
* "(\\s|.)*" +
* "<!--\\s*name\\s*=\\s*\"/\\1\"\\s*-->" ;
* matched everything as expected.
*
*
"<!--\\s*name\\s*=\\s*\"([\\w\\-]+)\"\\s*(template\\s*=\\s*\"news-tmpl\")?\\s*-->"
+
* "(\\s|.)*" +
* "<!--\\s*name\\s*=\\s*\"/\\1\"\\s*-->" ;
* matched everything as expected.
*/
/** Creates new MarkedUpHtmlTest */
public MarkedUpHtmlTest( String testName ) {
super(testName);
}
public static void main(java.lang.String[] args) {
junit.textui.TestRunner.run(suite());
}
/* Comments copied from junit.framework.TestSuite.
*/
/**
* A <code>TestSuite</code> is a
<code>Composite</code> of Tests.
* It runs a collection of test cases.
*
* This constructor creates a suite with all the
methods
* starting with "test" that take no arguments.
*/
public static Test suite() {
TestSuite suite = new TestSuite(
MarkedUpHtmlTest.class );
return suite;
}
/*
protected void setUp () throws Exception {}
*/
/*
protected void tearDown () throws Exception {}
*/
public void testPages() {
MarkedUpHtml m ;
try {
System.out.println("MULTILINE_TEST");
m = new MarkedUpHtml ( MULTILINE_TEST ) ;
checkObjectSequence( "MULTILINE_TEST", m,
MULTILINE_TEST_EXPECTED_OBJSEQ ) ;
System.out.println("");
System.out.println("EXTRA_SPACES");
m = new MarkedUpHtml ( EXTRA_SPACES ) ;
checkObjectSequence( "EXTRA_SPACES", m,
EXTRA_SPACES_EXPECTED_OBJSEQ ) ;
System.out.println("");
System.out.println("NO_SERVER_ACTIVE_HTML");
m = new MarkedUpHtml (
NO_SERVER_ACTIVE_HTML ) ;
checkObjectSequence(
"NO_SERVER_ACTIVE_HTML", m,
NO_SERVER_ACTIVE_HTML_EXPECTED_OBJSEQ ) ;
System.out.println("");
System.out.println("REFERENCED_TEMPLATE");
//
System.out.println(REFERENCED_TEMPLATE);
m = new MarkedUpHtml (
REFERENCED_TEMPLATE ) ;
checkObjectSequence(
"REFERENCED_TEMPLATE", m,
REFERENCED_TEMPLATE_EXPECTED_OBJSEQ ) ;
System.out.println("");
System.out.println("EMBEDDED_TEMPLATE");
// System.out.println(EMBEDDED_TEMPLATE);
m = new MarkedUpHtml ( EMBEDDED_TEMPLATE
) ;
checkObjectSequence( "EMBEDDED_TEMPLATE",
m, EMBEDDED_TEMPLATE_EXPECTED_OBJSEQ ) ;
System.out.println("TEMPLATE_PAGE");
// System.out.println(TEMPLATE_PAGE);
m = new MarkedUpHtml ( TEMPLATE_PAGE ) ;
System.out.println("TEMPLATE page not
checked");
//checkObjectSequence ( m,
TEMPLATE_PAGE_EXPECTED_OBJSEQ ) ;
System.out.println("");
}
catch ( MarkedUpHtmlException e ) {
fail( e.toString() ) ;
}
}
/**
* Test many sequences
*/
public void testBinarySequences() throws
MarkedUpHtmlException {
/*
* (HH = H)
* A,H
* AA, AH, HA
* AAA, AAH, AHA, HAA, HAH
* AAAA, AAAH, AAHA, AHAA, AHAH, HAAA, HAAH,
HAHA,
*/
/*
List comboList = Arrays.asList( new String []
{
"H", "A",
"AH", "AA", "HA",
"AAH", "AAA", "AHA", "HAH", "HAA",
"AAAH", "AAAA", "AAHA", "AHAH", "AHAA",
"HAAH", "HAAA", "HAHA"
} ) ;
*/
List comboList = Arrays.asList( new
String [] {
"H", "A",
"AH", "AA", "HA",
"AAH", "AAA", "AHA", "HAH", "HAA",
"AAAH", "AAAAH", "AAHA", "AHAH", "AHAA",
"HAAH", "HAAA", "HAHA"
} ) ;
String html = "<b><i>Hi</i></b>\n" ;
String SAHOpen = "<!-- name=\"" ;
String SAHClose = "\" -->" ;
String SAHContent = "<!-- element=\"content\"
-->Hi<!-- element=\"/content\" -->\n";
int namePostfix = 0 ;
Iterator it = comboList.iterator() ;
while ( it.hasNext() ) {
String seq = (String) it.next() ;
StringBuffer sb = new StringBuffer() ;
List expectedOrder = new LinkedList() ;
for ( int i = 0; i < seq.length() ; i++,
namePostfix++ ) {
if ( seq.charAt( i ) == 'A' ) {
sb.append(
SAHOpen + "name-" + namePostfix +
SAHClose +
SAHContent +
SAHOpen + "/name-" + namePostfix +
SAHClose
) ;
expectedOrder.add( "ACTIVE" ) ;
}
else {
sb.append( html ) ;
expectedOrder.add( "HTML" ) ;
}
}
System.out.println("testBinarySequences: "
+ seq);
long t = System.currentTimeMillis() ;
MarkedUpHtml m = new MarkedUpHtml(
sb.toString() ) ;
System.out.println("Seconds: " + ( (
System.currentTimeMillis() - t ) / 1000 ) +"\n" );
System.out.flush();
checkObjectSequence( "testBinarySequences:
" + seq, m, expectedOrder ) ;
}
}
/* Shared methods */
/**
* Check the parsing of the HTML resulted in the
correct order
* of objects
*/
protected void checkObjectSequence( String name,
MarkedUpHtml m, List expectedOrder ) {}
protected void checkObjectSequence_( String name,
MarkedUpHtml m, List expectedOrder ) {
List seq = m.getSequence() ;
assertEquals( "The number of actual objects
was wrong for " + name, expectedOrder.size(),
seq.size() ) ;
for ( int objIdx = 0 ; objIdx <
expectedOrder.size() ; objIdx++ ) {
String expected = (String)
expectedOrder.get( objIdx ) ;
Object actual = seq.get( objIdx ) ;
if ( expected.equals( "HTML" ) ) {
if ( ! ( actual instanceof Fragment )
)
fail( name + ":Expected Fragment
object, had " + actual.getClass().getName() ) ;
}
if ( expected.equals( "ACTIVE" ) ) {
if ( ! ( actual instanceof
ServerActiveHtml ) )
fail( name + ":Expected
ServerActiveHtml object, had " +
actual.getClass().getName() ) ;
}
}
}
}
/*
These were the parse times. The patterns ending in
SAH performed badly.
*
testBinarySequences: H
Seconds: 0
testBinarySequences: A
Seconds: 0
testBinarySequences: AH
Seconds: 0
testBinarySequences: AA
Seconds: 0
testBinarySequences: HA
Seconds: 0
testBinarySequences: AAH
Seconds: 0
testBinarySequences: AAA
Seconds: 12
testBinarySequences: AHA
Seconds: 0
testBinarySequences: HAH
Seconds: 0
testBinarySequences: HAA
Seconds: 0
testBinarySequences: AAAH
Seconds: 24
testBinarySequences: AAAA
Seconds: 3315
testBinarySequences: AAHA
Seconds: 12
testBinarySequences: AHAH
Seconds: 0
testBinarySequences: AHAA
Seconds: 12
testBinarySequences: HAAH
Seconds: 0
testBinarySequences: HAAA
Seconds: 12
testBinarySequences: HAHA
Seconds: 0
Testsuite:
tests.com.studylink.wps.html.MarkedUpHtmlTest
Tests run: 2, Failures: 0, Errors: 0, Time elapsed:
3,403.865 sec
*/
____________________________________________________________
Do You Yahoo!?
Get your free @yahoo.co.uk address at http://mail.yahoo.co.uk
or your free @yahoo.ie address at http://mail.yahoo.ie