Violeta,

On 21.12.2013 12:30, Violeta Georgieva wrote:
> Thanks for the testing.
> I'm still not convinced to stop the voting based on the frequency (1/5000)
> of the problem.

Ok.

I'm having a hard time creating a reproducible test case. As I test more, the frequency of failures is even lower than I initially reported. Sometimes it takes 5k requests to produce a failure, and sometimes even 500k requests are not enough.

If anyone wants to try to reproduce the problem, at the end of this message is the test I use. In the first phase, it crawls all pages starting from the Tomcat root at localhost:8080. In the second phase, it reads all collected URLs, 500 times in a row. In total, it reads around 140k pages. If there is an error, it is printed to the console, e.g.:

====
Round: 75 / 500
Round: 76 / 500
ERROR: couldn't open URL: 'http://localhost:82/examples/jsp/jsptoserv/ServletToJsp.java.html
Invalid Http response
Round: 77 / 500
Round: 78 / 500
----

There are also errors during the first phase (crawling), but they are 404s (e.g. /docs/api/*) or 401s (e.g. /manager/html), and may be ignored.
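
If anyone just wants to hammer a single failing URL instead of running the whole crawl, a small loop along these lines should also do it (only a sketch, not part of the test below; the URL and the iteration count are placeholders to adjust):

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

public class SingleUrlLoop {
    public static void main(String[] args) throws Exception {
        // Placeholder: point this at whichever URL the crawler reports as failing.
        URL url = new URL("http://localhost:8080/examples/jsp/jsptoserv/ServletToJsp.java.html");
        byte[] buf = new byte[8192];
        for (int i = 1; i <= 100000; i++) {
            try (InputStream in = url.openStream()) {
                // Drain the response; we only care whether the read fails.
                while (in.read(buf) != -1) {
                    // keep reading
                }
            } catch (IOException e) {
                // Any failed read ends up here, the same way the crawler below reports it.
                System.out.println("Iteration " + i + " failed: " + e.getMessage());
            }
        }
    }
}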

-Ognjen


package webcrawler;

// Based on: http://cs.nyu.edu/courses/fall02/G22.3033-008/WebCrawler.java

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.util.Vector;

public class WebCrawler {
    public static final int MAX_PAGES = 2000; // Absolute max pages
    public static final int MAXSIZE = 2000000; // Max size of file
    public static final boolean DEBUG = false;

    // URLs to be searched
    Vector<URL> newURLs;
    // Known URLs
    Hashtable<URL, Integer> knownURLs;
    String startUrl;

    public static void main(String[] argv) throws Exception {
        WebCrawler wc = new WebCrawler();

        String start = "http://localhost:8080/";
        wc.crawl(start);
        wc.repeat();
    }

    // Initializes the data structures with the starting URL.
    public void initialize(String start) {
        URL url;
        knownURLs = new Hashtable<URL, Integer>();
        newURLs = new Vector<URL>();
        try {
            startUrl = start;
            url = new URL(startUrl);
        } catch (MalformedURLException e) {
            System.out.println("Invalid starting URL " + startUrl);
            return;
        }
        knownURLs.put(url, Integer.valueOf(1));
        newURLs.addElement(url);
        System.out.println("Starting search: Initial URL " + url.toString());
        System.out.println("Maximum number of pages: " + MAX_PAGES);

    }

    // Adds a new URL to the queue. oldURL is the context, newUrlString is the
    // link (either an absolute or a relative URL). Only URLs below startUrl
    // that have not been seen before are queued.

    public void addnewurl(URL oldURL, String newUrlString) {
        URL url;
        // If the context URL looks like a directory without a trailing slash,
        // append one so that relative links resolve below it.
        if (oldURL.toString().matches(".*?/[a-z0-9_-]+")) {
            try {
                oldURL = new URL(oldURL.toString() + "/");
            } catch (MalformedURLException e) {
                throw new RuntimeException(e);
            }
        }
        if (DEBUG)
            System.out.println("URL String " + newUrlString);
        try {
            url = new URL(oldURL, newUrlString);
            if (!knownURLs.containsKey(url) && url.toString().startsWith(startUrl)) {
                knownURLs.put(url, Integer.valueOf(1));
                newURLs.addElement(url);
                System.out.println("Found new URL " + url.toString());
            }
        } catch (MalformedURLException e) {
            return;
        }
    }

    // Download contents of URL
    public String getpage(URL url, boolean printMessages) {
        try {
            if (url.toString().contains("/examples/async/")) {
                System.out.println("skip async url " + url.toString());
                return "";
            }
            if (printMessages) {
                System.out.println("Downloading " + url.toString());
            }

            // Open the connection and read the whole response (up to MAXSIZE).
            URLConnection urlConnection = url.openConnection();
            urlConnection.setAllowUserInteraction(false);

            byte[] b = new byte[1000];
            StringBuilder content = new StringBuilder();
            InputStream urlStream = urlConnection.getInputStream();
            try {
                int numRead;
                while ((numRead = urlStream.read(b)) != -1
                        && content.length() < MAXSIZE) {
                    content.append(new String(b, 0, numRead));
                }
            } finally {
                urlStream.close();
            }
            return content.toString();

        } catch (IOException e) {
            System.out.println("ERROR: couldn't open URL: '" + url.toString());
            System.out.println(e.getMessage());
            return "";
        }
    }

    // Go through page finding links to URLs. A link is signalled
    // by <a href=" ... It ends with a close angle bracket, preceded
    // by a close quote, possibly preceded by a hatch mark (marking a
    // fragment, an internal page marker)

    public void processpage(URL url, String page) {
        String lcPage = page.toLowerCase(); // Page in lower case
        int index = 0; // position in page
        int iEndAngle, ihref, iURL, iCloseQuote, iHatchMark, iEnd;
        while ((index = lcPage.indexOf("<a", index)) != -1) {
            iEndAngle = lcPage.indexOf(">", index);
            if (iEndAngle == -1) {
                break; // malformed tag with no closing '>', stop scanning
            }
            ihref = lcPage.indexOf("href", index);
            if (ihref != -1) {
                // indexOf returns -1 when no quote is found, so iURL == 0 means "not found"
                iURL = lcPage.indexOf("\"", ihref) + 1;
                if ((iURL != 0) && (iURL < iEndAngle)) {
                    iCloseQuote = lcPage.indexOf("\"", iURL);
                    iHatchMark = lcPage.indexOf("#", iURL);
                    if ((iCloseQuote != -1) && (iCloseQuote < iEndAngle)) {
                        iEnd = iCloseQuote;
                        if ((iHatchMark != -1) && (iHatchMark < iCloseQuote))
                            iEnd = iHatchMark;
                        String newUrlString = page.substring(iURL, iEnd);
                        addnewurl(url, newUrlString);
                    }
                }
            }
            index = iEndAngle;
        }
    }

    // Top-level procedure. Keep popping a url off newURLs, download
    // it, and accumulate new URLs

    public void crawl(String startUrl) {
        initialize(startUrl);
        for (int i = 0; i < MAX_PAGES; i++) {
            URL url = newURLs.elementAt(0);
            newURLs.removeElementAt(0);
            if (DEBUG) {
                System.out.println("Searching " + url.toString());
            }
            String page = getpage(url, true);
            if (DEBUG) {
                System.out.println(page);
            }
            if (page.length() != 0) {
                processpage(url, page);
            }
            if (newURLs.isEmpty()) {
                break;
            }
        }
        System.out.println("Crawl complete, total " + knownURLs.size() + " pages.");
    }

    public void repeat() {
        for (int i = 1; i <= 500; i++) {
            System.out.printf("Round: %d / 500\r\n", i);
            for (URL url : knownURLs.keySet()) {
                String urls = url.toString();
                // Skip pages that require authentication, javadoc pages that
                // may not be installed, and the async examples.
                if (!urls.endsWith("/manager/status")
                        && !urls.endsWith("/manager/html")
                        && !urls.endsWith("/host-manager/html")
                        && !urls.contains("/docs/api/org/apache/")
                        && !urls.contains("/examples/async/")) {
                    getpage(url, false);
                }
            }
        }
    }

}
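
The test needs nothing beyond the JDK: compile the class and run webcrawler.WebCrawler against a Tomcat listening on localhost:8080 (adjust the start URL in main() if your connector port differs).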



