Author: markt
Date: Sun Feb  6 19:28:48 2011
New Revision: 1067734

URL: http://svn.apache.org/viewvc?rev=1067734&view=rev
Log:
First attempt at a valve to limit session creation by web crawlers.
Docs etc to follow once it is confirmed working.

Added:
    
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java   
(with props)

Added: 
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
URL: 
http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java?rev=1067734&view=auto
==============================================================================
--- 
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java 
(added)
+++ 
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java 
Sun Feb  6 19:28:48 2011
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.catalina.valves;
+
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpSession;
+
+import org.apache.catalina.LifecycleException;
+import org.apache.catalina.connector.Request;
+import org.apache.catalina.connector.Response;
+
+/**
+ * Web crawlers can trigger the creation of many thousands of sessions as they
+ * crawl a site which may result in significant memory consumption. This Valve
+ * ensures that crawlers are associated with a single session - just like 
normal
+ * users - regardless of whether or not they provide a session token with their
+ * requests.
+ */
+public class CrawlerSessionManagerValve extends ValveBase {
+
+    private Map<String,SessionInfo> uaIpSessionInfo =
+        new ConcurrentHashMap<String, SessionInfo>();
+
+    private String crawlerUserAgents =
+        ".*GoogleBot.*|.*bingbot.*|.*Yahoo! Slurp.*";
+    private Matcher uaMatcher = null;
+    private int sessionInactiveInterval = 60;
+
+
+    /**
+     * Specify the regular expression (using {@link Pattern}) that will be used
+     * to identify crawlers based in the User-Agent header provided. The 
default
+     * is ".*GoogleBot.*|.*bingbot.*|.*Yahoo! Slurp.*"
+     *  
+     * @param crawlerUserAgents The regular expression using {@link Pattern}
+     */
+    public void setCrawlerUserAgents(String crawlerUserAgents) {
+        this.crawlerUserAgents = crawlerUserAgents;
+        if (crawlerUserAgents == null || crawlerUserAgents.length() == 0) {
+            uaMatcher = null;
+        } else {
+            uaMatcher = Pattern.compile(crawlerUserAgents).matcher("");
+        }
+    }
+
+    /**
+     * @see #setCrawlerUserAgents(String)
+     * @return  The current regular expression being used to match user 
agents. 
+     */
+    public String getCrawlerUserAgents() {
+        return crawlerUserAgents;
+    }
+
+
+    /**
+     * Specify the session timeout (in seconds) for a crawler's session. This 
is
+     * typically lower than that for a user session. The default is 60 seconds.
+     *  
+     * @param sessionInactiveInterval   The new timeout for crawler sessions
+     */
+    public void setSessionInactiveInterval(int sessionInactiveInterval) {
+        this.sessionInactiveInterval = sessionInactiveInterval;
+    }
+
+    /**
+     * @see #setSessionInactiveInterval(int)
+     * @return  The current timeout in seconds
+     */
+    public int getSessionInactiveInterval() {
+        return sessionInactiveInterval;
+    }
+
+
+    @Override
+    protected void initInternal() throws LifecycleException {
+        super.initInternal();
+        
+        uaMatcher = Pattern.compile(crawlerUserAgents).matcher("");
+    }
+
+
+    @Override
+    public void invoke(Request request, Response response) throws IOException,
+            ServletException {
+
+        boolean isBot = false;
+        SessionInfo sessionInfo = null;
+        String clientIp = null;
+
+        // If the incoming request has a session ID, no action is required
+        if (request.getRequestedSessionId() == null) {
+
+            // Is this a crawler
+            Enumeration<String> uaHeaders = request.getHeaders("user-agent");
+            while (!isBot && uaMatcher != null &&
+                    uaHeaders.hasMoreElements()) {
+                String uaHeader = uaHeaders.nextElement();
+                uaMatcher.reset(uaHeader);
+                if (uaMatcher.matches()) {
+                    isBot = true;
+                }
+            }
+            
+            // If this is a bot, is the session ID known?
+            if (isBot) {
+                clientIp = request.getRemoteAddr();
+                sessionInfo = uaIpSessionInfo.get(clientIp);
+                if (sessionInfo != null) {
+                    request.setRequestedSessionId(sessionInfo.getSessionId());
+                }
+            }
+        }
+
+        getNext().invoke(request, response);
+        
+        if (isBot) {
+            if (sessionInfo == null) {
+                // Has bot just created a session, if so make a note of it
+                HttpSession s = request.getSession(false);
+                if (s != null) {
+                    uaIpSessionInfo.put(clientIp, new SessionInfo(s.getId()));
+                    s.setMaxInactiveInterval(sessionInactiveInterval);
+                }
+            } else {
+                sessionInfo.access();
+            }
+        }
+    }
+
+
+    @Override
+    public void backgroundProcess() {
+        super.backgroundProcess();
+        
+        long expireTime = System.currentTimeMillis() +
+                (sessionInactiveInterval + 60) * 1000;
+
+        Iterator<Entry<String,SessionInfo>> iter =
+            uaIpSessionInfo.entrySet().iterator();
+
+        // Remove any sessions in the cache that have expired. 
+        while (iter.hasNext()) {
+            Entry<String,SessionInfo> entry = iter.next();
+            if (entry.getValue().getLastAccessed() > expireTime) {
+                iter.remove();
+            }
+        }
+    }
+
+
+    private static final class SessionInfo {
+        private final String sessionId;
+        private volatile long lastAccessed;
+        
+        public SessionInfo(String sessionId) {
+            this.sessionId = sessionId;
+            this.lastAccessed = System.currentTimeMillis();
+        }
+
+        public String getSessionId() {
+            return sessionId;
+        }
+
+        public long getLastAccessed() {
+            return lastAccessed;
+        }
+
+        public void access() {
+            lastAccessed = System.currentTimeMillis();
+        }
+    }
+}

Propchange: 
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
------------------------------------------------------------------------------
    svn:eol-style = native



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org
For additional commands, e-mail: dev-h...@tomcat.apache.org

Reply via email to