Author: aheritier Date: Sat Jan 14 16:36:18 2006 New Revision: 369134 URL: http://svn.apache.org/viewcvs?rev=369134&view=rev Log: PR: MPLINKCHECK-20, MPLINKCHECK-23 Submitted by: Ignacio G. Mac Dowell Reviewed by: aheritier Improve performance by replacing the jtidy dependency with regexp-based link matching. Fix StackOverflowError when processing apidocs/index-all.html.
Added: maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java Modified: maven/maven-1/plugins/trunk/linkcheck/project.xml maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml Modified: maven/maven-1/plugins/trunk/linkcheck/project.xml URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/project.xml?rev=369134&r1=369133&r2=369134&view=diff ============================================================================== --- maven/maven-1/plugins/trunk/linkcheck/project.xml (original) +++ maven/maven-1/plugins/trunk/linkcheck/project.xml Sat Jan 14 16:36:18 2006 @@ -201,28 +201,9 @@ </properties> </dependency> <dependency> - <groupId>dom4j</groupId> - <artifactId>dom4j</artifactId> - <version>1.4</version> - <properties> - <comment>This library is already loaded by maven's core. Be careful to use the same version number as in the core.</comment> - </properties> - </dependency> - <dependency> - <groupId>jtidy</groupId> - <artifactId>jtidy</artifactId> - <version>4aug2000r7-dev</version> - </dependency> - <dependency> <groupId>maven</groupId> <artifactId>maven</artifactId> <version>1.0.2</version> - </dependency> - <dependency> - <jar>js-1.5R4-RC3.jar</jar> - <groupId>rhino</groupId> - <artifactId>rhino</artifactId> - <version>1.5R4-RC3</version> </dependency> </dependencies> </project> Modified: maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java?rev=369134&r1=369133&r2=369134&view=diff ============================================================================== --- maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java (original) +++ 
maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/FileToCheck.java Sat Jan 14 16:36:18 2006 @@ -17,29 +17,19 @@ * ==================================================================== */ -import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.InputStream; import java.io.OutputStream; -import java.io.PrintWriter; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; -import java.util.TreeSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.maven.plugin.linkcheck.validation.LinkValidationItem; import org.apache.maven.plugin.linkcheck.validation.LinkValidationResult; import org.apache.maven.plugin.linkcheck.validation.LinkValidatorManager; -import org.dom4j.Document; -import org.dom4j.Node; -import org.dom4j.io.DOMReader; -import org.w3c.tidy.Tidy; /** * @author <a href="mailto:[EMAIL PROTECTED]">Ben Walding</a> @@ -73,30 +63,6 @@ private int unsuccessful; - private Set getLinks() - throws FileNotFoundException - { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintWriter errOut = new PrintWriter( baos ); - BufferedInputStream bin = new BufferedInputStream( new FileInputStream( fileToCheck ) ); - try - { - Tidy tidy = getTidy(); - tidy.setErrout( errOut ); - org.w3c.dom.Document domDocument = tidy.parseDOM( bin, null ); - // now read a dom4j document from - // JTidy's W3C DOM object - final DOMReader domReader = new DOMReader(); - final Document doc = domReader.read( domDocument ); - return findUniqueLinks( doc ); - } - finally - { - close( bin ); - close( baos ); - } - } - /** * Returns the message. 
* @return String @@ -172,7 +138,7 @@ final Set hrefs; try { - hrefs = getLinks(); + hrefs = LinkMatcher.match( fileToCheck ); } catch ( Throwable t ) { @@ -259,69 +225,4 @@ { this.links.add( lcr ); } - - private void close( InputStream is ) - { - try - { - is.close(); - } - catch ( Exception e ) - { - //Don't really care. - } - } - - private void close( OutputStream os ) - { - try - { - os.close(); - } - catch ( Exception e ) - { - //Don't really care. - } - } - - private Set findUniqueLinks( Document doc ) - { - List xpathResults = new LinkedList(); - - xpathResults.addAll( doc.selectNodes( "//a/@href" ) ); - xpathResults.addAll( doc.selectNodes( "//img/@src" ) ); - - xpathResults.addAll( doc.selectNodes( "//link/@href" ) ); - - xpathResults.addAll( doc.selectNodes( "//script/@src" ) ); - - Set results = new TreeSet(); - Iterator linkIter = xpathResults.iterator(); - Node node = null; - String href = null; - while ( linkIter.hasNext() ) - { - node = (Node) linkIter.next(); - href = node.getText(); - results.add( href ); - } - xpathResults = null; - linkIter = null; - node = null; - href = null; - return results; - } - - private Tidy getTidy() - { - Tidy tidy = new Tidy(); - tidy.setMakeClean( true ); - tidy.setXmlTags( true ); - tidy.setXmlOut( true ); - tidy.setXHTML( true ); - tidy.setQuiet( true ); - tidy.setShowWarnings( false ); - return tidy; - } - -} \ No newline at end of file +} Added: maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java?rev=369134&view=auto ============================================================================== --- maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java (added) +++ maven/maven-1/plugins/trunk/linkcheck/src/main/org/apache/maven/plugin/linkcheck/LinkMatcher.java Sat Jan 14 16:36:18 2006 @@ -0,0 
+1,122 @@ +package org.apache.maven.plugin.linkcheck; + +/* ==================================================================== + * Copyright 2001-2006 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Link matcher. Reads the contents of a file and tries to match the following: + * <code> + * <a href="".... + * <link href="".... + * <img src="".... + * <script src="".... + * </code> + * + * @author <a href="mailto:[EMAIL PROTECTED]">Ignacio G. Mac Dowell </a> + */ +class LinkMatcher +{ + + /** + * Regexp for link matching. + */ + private final static Pattern p = Pattern + .compile( "<(?>link|a|img|script)[^>]*?(?>href|src)\\s*?=\\s*?[\\\"'](.*?)[\\\"'][^>]*?", + Pattern.CASE_INSENSITIVE ); + + /** + * No need to create a new object each time a file is processed. Just clear + * it. + */ + private final static Set linkList = new TreeSet(); + + /** + * Reads a file and returns a StringBuffer with its contents. + * + * TODO: Check for encoding issues + * + * TODO: Better exception handling? + * + * @param file + * the file we are reading + * @return a StringBuffer with file's contents. 
+ * @throws IOException + */ + private static StringBuffer fileToStringBuffer( File file ) + throws IOException + { + BufferedReader reader = null; + final StringBuffer pageBuffer = new StringBuffer(); + try + { + reader = new BufferedReader( new FileReader( file ) ); + String line; + while ( ( line = reader.readLine() ) != null ) + { + pageBuffer.append( line ); + } + } + finally + { + reader.close(); + } + return pageBuffer; + } + + /** + * Performs the actual matching. + * + * @param file + * the file to check + * @return a set with all links to check + * @throws IOException + */ + static Set match( File file ) + throws IOException + { + linkList.clear(); + final Matcher m = p.matcher( fileToStringBuffer( file ) ); + String link; + while ( m.find() ) + { + link = m.group( 1 ).trim(); + if ( link.length() < 1 ) + { + continue; + } + else if ( link.toLowerCase().indexOf( "javascript" ) != -1 ) + { + continue; + } + // else if (link.toLowerCase().indexOf("mailto:") != -1) { + // continue; + // } + linkList.add( link ); + } + return linkList; + } + +} Modified: maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml URL: http://svn.apache.org/viewcvs/maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml?rev=369134&r1=369133&r2=369134&view=diff ============================================================================== --- maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml (original) +++ maven/maven-1/plugins/trunk/linkcheck/xdocs/changes.xml Sat Jan 14 16:36:18 2006 @@ -26,6 +26,8 @@ </properties> <body> <release version="1.4-SNAPSHOT" date="in SVN"> + <action dev="aheritier" type="update" issue="MPLINKCHECK-23" due-to="Ignacio G. Mac Dowell">Improve performance getting rid of jtidy dependency via regexps.</action> + <action dev="aheritier" type="fix" issue="MPLINKCHECK-20" due-to="Ignacio G. 
Mac Dowell">StackOverflowError processing apidocs/index-all.html.</action> <action dev="aheritier" type="add">If maven is in offline mode the report doesn't test external urls. A warning is displayed in the report.</action> <action dev="aheritier" type="update" issue="MPLINKCHECK-10">"Moved Permanently" sites are reported as a warning and not as an error.</action> <action dev="aheritier" type="update" issue="MPLINKCHECK-24">Speed and stability enhancement [better usage of httpClient].</action>