Author: markt Date: Sun Feb 6 19:28:48 2011 New Revision: 1067734 URL: http://svn.apache.org/viewvc?rev=1067734&view=rev Log: First attempt at a valve to limit session creation by web crawlers. Docs etc to follow once it is confirmed working.
Added: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java (with props) Added: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java URL: http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java?rev=1067734&view=auto ============================================================================== --- tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java (added) +++ tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java Sun Feb 6 19:28:48 2011 @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.catalina.valves; + +import java.io.IOException; +import java.util.Enumeration; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpSession; + +import org.apache.catalina.LifecycleException; +import org.apache.catalina.connector.Request; +import org.apache.catalina.connector.Response; + +/** + * Web crawlers can trigger the creation of many thousands of sessions as they + * crawl a site which may result in significant memory consumption. This Valve + * ensures that crawlers are associated with a single session - just like normal + * users - regardless of whether or not they provide a session token with their + * requests. + */ +public class CrawlerSessionManagerValve extends ValveBase { + + private Map<String,SessionInfo> uaIpSessionInfo = + new ConcurrentHashMap<String, SessionInfo>(); + + private String crawlerUserAgents = + ".*GoogleBot.*|.*bingbot.*|.*Yahoo! Slurp.*"; + private Matcher uaMatcher = null; + private int sessionInactiveInterval = 60; + + + /** + * Specify the regular expression (using {@link Pattern}) that will be used + * to identify crawlers based in the User-Agent header provided. The default + * is ".*GoogleBot.*|.*bingbot.*|.*Yahoo! Slurp.*" + * + * @param crawlerUserAgents The regular expression using {@link Pattern} + */ + public void setCrawlerUserAgents(String crawlerUserAgents) { + this.crawlerUserAgents = crawlerUserAgents; + if (crawlerUserAgents == null || crawlerUserAgents.length() == 0) { + uaMatcher = null; + } else { + uaMatcher = Pattern.compile(crawlerUserAgents).matcher(""); + } + } + + /** + * @see #setCrawlerUserAgents(String) + * @return The current regular expression being used to match user agents. + */ + public String getCrawlerUserAgents() { + return crawlerUserAgents; + } + + + /** + * Specify the session timeout (in seconds) for a crawler's session. This is + * typically lower than that for a user session. The default is 60 seconds. + * + * @param sessionInactiveInterval The new timeout for crawler sessions + */ + public void setSessionInactiveInterval(int sessionInactiveInterval) { + this.sessionInactiveInterval = sessionInactiveInterval; + } + + /** + * @see #setSessionInactiveInterval(int) + * @return The current timeout in seconds + */ + public int getSessionInactiveInterval() { + return sessionInactiveInterval; + } + + + @Override + protected void initInternal() throws LifecycleException { + super.initInternal(); + + uaMatcher = Pattern.compile(crawlerUserAgents).matcher(""); + } + + + @Override + public void invoke(Request request, Response response) throws IOException, + ServletException { + + boolean isBot = false; + SessionInfo sessionInfo = null; + String clientIp = null; + + // If the incoming request has a session ID, no action is required + if (request.getRequestedSessionId() == null) { + + // Is this a crawler + Enumeration<String> uaHeaders = request.getHeaders("user-agent"); + while (!isBot && uaMatcher != null && + uaHeaders.hasMoreElements()) { + String uaHeader = uaHeaders.nextElement(); + uaMatcher.reset(uaHeader); + if (uaMatcher.matches()) { + isBot = true; + } + } + + // If this is a bot, is the session ID known? + if (isBot) { + clientIp = request.getRemoteAddr(); + sessionInfo = uaIpSessionInfo.get(clientIp); + if (sessionInfo != null) { + request.setRequestedSessionId(sessionInfo.getSessionId()); + } + } + } + + getNext().invoke(request, response); + + if (isBot) { + if (sessionInfo == null) { + // Has bot just created a session, if so make a note of it + HttpSession s = request.getSession(false); + if (s != null) { + uaIpSessionInfo.put(clientIp, new SessionInfo(s.getId())); + s.setMaxInactiveInterval(sessionInactiveInterval); + } + } else { + sessionInfo.access(); + } + } + } + + + @Override + public void backgroundProcess() { + super.backgroundProcess(); + + long expireTime = System.currentTimeMillis() + + (sessionInactiveInterval + 60) * 1000; + + Iterator<Entry<String,SessionInfo>> iter = + uaIpSessionInfo.entrySet().iterator(); + + // Remove any sessions in the cache that have expired. + while (iter.hasNext()) { + Entry<String,SessionInfo> entry = iter.next(); + if (entry.getValue().getLastAccessed() > expireTime) { + iter.remove(); + } + } + } + + + private static final class SessionInfo { + private final String sessionId; + private volatile long lastAccessed; + + public SessionInfo(String sessionId) { + this.sessionId = sessionId; + this.lastAccessed = System.currentTimeMillis(); + } + + public String getSessionId() { + return sessionId; + } + + public long getLastAccessed() { + return lastAccessed; + } + + public void access() { + lastAccessed = System.currentTimeMillis(); + } + } +} Propchange: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java ------------------------------------------------------------------------------ svn:eol-style = native --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org For additional commands, e-mail: dev-h...@tomcat.apache.org