Violeta,
On 21.12.2013 12:30, Violeta Georgieva wrote:
> Thanks for the testing.
> I'm still not convinced to stop the voting based on the frequency (1/5000)
> of the problem.
Ok.
I'm having a hard time creating a reproducible test case. As I test
more, the frequency of failures is even lower than I initially
reported. Sometimes it takes 5k requests to produce a failure, and
sometimes even 500k requests are not enough.
If anyone wants to try to reproduce the problem, at the end of this
message is the test I use. In the first phase, it crawls all pages
starting from the Tomcat root at localhost:8080. In the second phase,
it reads all collected URLs, 500 times in a row. In total, it reads
around 140k pages. If there is an error, it is printed to the
console, e.g.:
====
Round: 75 / 500
Round: 76 / 500
ERROR: couldn't open URL:
'http://localhost:82/examples/jsp/jsptoserv/ServletToJsp.java.html'
Invalid Http response
Round: 77 / 500
Round: 78 / 500
----
There are also errors during the first phase (crawling), but they are
404s (e.g. /docs/api/*) or 401s (e.g. /manager/html), and may be ignored.
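For reference, I compile and run the test with the usual javac/java
invocation (assuming the source is saved as webcrawler/WebCrawler.java;
adjust the start URL in main() if Tomcat listens on another port):

    javac webcrawler/WebCrawler.java
    java webcrawler.WebCrawler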
-Ognjen
package webcrawler;
// Based on: http://cs.nyu.edu/courses/fall02/G22.3033-008/WebCrawler.java
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.util.Vector;
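// Test client: crawls a local Tomcat instance, collecting all URLs
// reachable from the root, then repeatedly re-fetches every page.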
public class WebCrawler {
public static final int MAX_PAGES = 2000; // Absolute max pages
    public static final int MAXSIZE = 2000000; // Max size of a page, in chars
public static final boolean DEBUG = false;
// URLs to be searched
Vector<URL> newURLs;
// Known URLs
Hashtable<URL, Integer> knownURLs;
String startUrl;
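    // Entry point: first crawl from the start URL, then re-read
    // everything that was collected.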
public static void main(String[] argv) throws Exception {
WebCrawler wc = new WebCrawler();
String start = "http://localhost:8080/";
wc.crawl(start);
wc.repeat();
}
    // Initializes the data structures. start is the root URL of the crawl.
public void initialize(String start) {
URL url;
knownURLs = new Hashtable<URL, Integer>();
newURLs = new Vector<URL>();
try {
startUrl = start;
url = new URL(startUrl);
} catch (MalformedURLException e) {
System.out.println("Invalid starting URL " + startUrl);
return;
}
        knownURLs.put(url, Integer.valueOf(1));
newURLs.addElement(url);
System.out.println("Starting search: Initial URL " +
url.toString());
System.out.println("Maximum number of pages:" + MAX_PAGES);
}
    // Adds a new URL to the queue. Accepts only URLs that are under
    // the start URL. oldURL is the context, newUrlString is the link
    // (either an absolute or a relative URL).
public void addnewurl(URL oldURL, String newUrlString) {
URL url;
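        // A directory URL without a trailing slash (e.g. ".../examples")
        // makes relative links resolve against its parent; append "/"
        // so they resolve against the directory itself.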
if (oldURL.toString().matches(".*?/[a-z0-9_-]+")) {
try {
oldURL = new URL(oldURL.toString() + "/");
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
}
if (DEBUG)
System.out.println("URL String " + newUrlString);
try {
url = new URL(oldURL, newUrlString);
if (!knownURLs.containsKey(url) &&
url.toString().startsWith(startUrl)) {
                knownURLs.put(url, Integer.valueOf(1));
newURLs.addElement(url);
System.out.println("Found new URL " + url.toString());
}
} catch (MalformedURLException e) {
return;
}
}
    // Downloads the contents of a URL and returns it as a string
    // (empty string on error).
    public String getpage(URL url, boolean printMessages) {
        // the async examples hold the connection open; skip them
        if (url.toString().contains("/examples/async/")) {
            System.out.println("skip async url " + url.toString());
            return "";
        }
        try {
            if (printMessages) {
                System.out.println("Downloading " + url.toString());
            }
            URLConnection urlConnection = url.openConnection();
            urlConnection.setAllowUserInteraction(false);
            // read the entire response, up to MAXSIZE characters
            StringBuilder content = new StringBuilder();
            try (InputStream urlStream = urlConnection.getInputStream()) {
                byte b[] = new byte[1000];
                int numRead;
                while ((numRead = urlStream.read(b)) != -1
                        && content.length() < MAXSIZE) {
                    content.append(new String(b, 0, numRead));
                }
            }
            return content.toString();
        } catch (IOException e) {
            System.out.println("ERROR: couldn't open URL: '" +
                    url.toString() + "'");
            System.out.println(e.getMessage());
            return "";
        }
    }
// Go through page finding links to URLs. A link is signalled
// by <a href=" ... It ends with a close angle bracket, preceded
// by a close quote, possibly preceded by a hatch mark (marking a
// fragment, an internal page marker)
public void processpage(URL url, String page) {
String lcPage = page.toLowerCase(); // Page in lower case
int index = 0; // position in page
int iEndAngle, ihref, iURL, iCloseQuote, iHatchMark, iEnd;
        while ((index = lcPage.indexOf("<a", index)) != -1) {
            iEndAngle = lcPage.indexOf(">", index);
            if (iEndAngle == -1) {
                // unterminated tag: stop, otherwise indexOf would
                // restart from 0 and loop forever
                break;
            }
            ihref = lcPage.indexOf("href", index);
            if (ihref != -1) {
                iURL = lcPage.indexOf("\"", ihref) + 1;
                // indexOf returns -1 when there is no opening quote,
                // which leaves iURL == 0
                if ((iURL != 0) && (iURL < iEndAngle)) {
                    iCloseQuote = lcPage.indexOf("\"", iURL);
                    iHatchMark = lcPage.indexOf("#", iURL);
                    if ((iCloseQuote != -1) && (iCloseQuote < iEndAngle)) {
                        iEnd = iCloseQuote;
                        if ((iHatchMark != -1) && (iHatchMark < iCloseQuote)) {
                            iEnd = iHatchMark;
                        }
                        String newUrlString = page.substring(iURL, iEnd);
                        addnewurl(url, newUrlString);
                    }
                }
            }
            index = iEndAngle;
        }
}
    // Top-level procedure. Keeps popping a URL off newURLs,
    // downloading it, and accumulating new URLs.
    public void crawl(String startUrl) {
        initialize(startUrl);
        for (int i = 0; i < MAX_PAGES && !newURLs.isEmpty(); i++) {
            URL url = newURLs.elementAt(0);
            newURLs.removeElementAt(0);
            if (DEBUG) {
                System.out.println("Searching " + url.toString());
            }
            String page = getpage(url, true);
            if (DEBUG) {
                System.out.println(page);
            }
            if (page.length() != 0) {
                processpage(url, page);
            }
        }
        System.out.println("Crawl complete, total " + knownURLs.size()
                + " pages.");
    }
public void repeat() {
for (int i = 1; i <= 500; i++) {
System.out.printf("Round: %d / 500\r\n", i);
for (URL url : knownURLs.keySet()) {
String urls = url.toString();
if (!urls.endsWith("/manager/status")
&& !urls.endsWith("/manager/html")
&& !urls.endsWith("/host-manager/html")
&& !urls.contains("/docs/api/org/apache/")
&& !urls.contains("/examples/async/")) {
getpage(url, false);
}
}
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org
For additional commands, e-mail: dev-h...@tomcat.apache.org