Author: markt Date: Mon Apr 23 19:48:11 2018 New Revision: 1829934 URL: http://svn.apache.org/viewvc?rev=1829934&view=rev Log: Fix https://bz.apache.org/bugzilla/show_bug.cgi?id=62297 Enable the CrawlerSessionManagerValve to correctly handle bots that crawl multiple hosts and/or web applications when the Valve is configured on a Host or an Engine.
Modified: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java tomcat/trunk/webapps/docs/changelog.xml tomcat/trunk/webapps/docs/config/valve.xml Modified: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java URL: http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java?rev=1829934&r1=1829933&r2=1829934&view=diff ============================================================================== --- tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java (original) +++ tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java Mon Apr 23 19:48:11 2018 @@ -27,6 +27,8 @@ import javax.servlet.http.HttpSession; import javax.servlet.http.HttpSessionBindingEvent; import javax.servlet.http.HttpSessionBindingListener; +import org.apache.catalina.Context; +import org.apache.catalina.Host; import org.apache.catalina.LifecycleException; import org.apache.catalina.connector.Request; import org.apache.catalina.connector.Response; @@ -44,8 +46,8 @@ public class CrawlerSessionManagerValve private static final Log log = LogFactory.getLog(CrawlerSessionManagerValve.class); - private final Map<String, String> clientIpSessionId = new ConcurrentHashMap<>(); - private final Map<String, String> sessionIdClientIp = new ConcurrentHashMap<>(); + private final Map<String, String> clientIdSessionId = new ConcurrentHashMap<>(); + private final Map<String, String> sessionIdClientId = new ConcurrentHashMap<>(); private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*"; private Pattern uaPattern = null; @@ -55,6 +57,10 @@ public class CrawlerSessionManagerValve private int sessionInactiveInterval = 60; + private boolean isHostAware = true; + + private boolean isContextAware = true; + /** * Specifies a default constructor so async support can be configured. @@ -134,7 +140,27 @@ public class CrawlerSessionManagerValve public Map<String, String> getClientIpSessionId() { - return clientIpSessionId; + return clientIdSessionId; + } + + + public boolean isHostAware() { + return isHostAware; + } + + + public void setHostAware(boolean isHostAware) { + this.isHostAware = isHostAware; + } + + + public boolean isContextAware() { + return isContextAware; + } + + + public void setContextAware(boolean isContextAware) { + this.isContextAware = isContextAware; } @@ -152,9 +178,10 @@ public class CrawlerSessionManagerValve boolean isBot = false; String sessionId = null; String clientIp = request.getRemoteAddr(); + String clientIdentifier = getClientIdentifier(request.getHost(), request.getContext(), clientIp); if (log.isDebugEnabled()) { - log.debug(request.hashCode() + ": ClientIp=" + clientIp + ", RequestedSessionId=" + log.debug(request.hashCode() + ": ClientIdentifier=" + clientIdentifier + ", RequestedSessionId=" + request.getRequestedSessionId()); } @@ -194,7 +221,7 @@ public class CrawlerSessionManagerValve // If this is a bot, is the session ID known? if (isBot) { - sessionId = clientIpSessionId.get(clientIp); + sessionId = clientIdSessionId.get(clientIdentifier); if (sessionId != null) { request.setRequestedSessionId(sessionId); if (log.isDebugEnabled()) { @@ -211,8 +238,8 @@ public class CrawlerSessionManagerValve // Has bot just created a session, if so make a note of it HttpSession s = request.getSession(false); if (s != null) { - clientIpSessionId.put(clientIp, s.getId()); - sessionIdClientIp.put(s.getId(), clientIp); + clientIdSessionId.put(clientIdentifier, s.getId()); + sessionIdClientId.put(s.getId(), clientIdentifier); // #valueUnbound() will be called on session expiration s.setAttribute(this.getClass().getName(), this); s.setMaxInactiveInterval(sessionInactiveInterval); @@ -231,11 +258,23 @@ public class CrawlerSessionManagerValve } + private String getClientIdentifier(Host host, Context context, String clientIp) { + StringBuilder result = new StringBuilder(clientIp); + if (isHostAware) { + result.append('-').append(host.getName()); + } + if (isContextAware) { + result.append(context.getName()); + } + return result.toString(); + } + + @Override public void valueUnbound(HttpSessionBindingEvent event) { - String clientIp = sessionIdClientIp.remove(event.getSession().getId()); - if (clientIp != null) { - clientIpSessionId.remove(clientIp); + String clientIdentifier = sessionIdClientId.remove(event.getSession().getId()); + if (clientIdentifier != null) { + clientIdSessionId.remove(clientIdentifier); } } } Modified: tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java URL: http://svn.apache.org/viewvc/tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java?rev=1829934&r1=1829933&r2=1829934&view=diff ============================================================================== --- tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java (original) +++ tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java Mon Apr 23 19:48:11 2018 @@ -16,12 +16,17 @@ */ package org.apache.catalina.valves; +import java.io.IOException; +import java.util.Arrays; import java.util.Collections; +import javax.servlet.ServletException; import javax.servlet.http.HttpSession; import org.junit.Test; +import org.apache.catalina.Context; +import org.apache.catalina.Host; import org.apache.catalina.Valve; import org.apache.catalina.connector.Request; import org.apache.catalina.connector.Response; @@ -34,6 +39,7 @@ public class TestCrawlerSessionManagerVa public void testCrawlerIpsPositive() throws Exception { CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); valve.setCrawlerIps("216\\.58\\.206\\.174"); + valve.setCrawlerUserAgents(valve.getCrawlerUserAgents()); valve.setNext(EasyMock.createMock(Valve.class)); HttpSession session = createSessionExpectations(valve, true); Request request = createRequestExpectations("216.58.206.174", session, true); @@ -49,6 +55,7 @@ public class TestCrawlerSessionManagerVa public void testCrawlerIpsNegative() throws Exception { CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); valve.setCrawlerIps("216\\.58\\.206\\.174"); + valve.setCrawlerUserAgents(valve.getCrawlerUserAgents()); valve.setNext(EasyMock.createMock(Valve.class)); HttpSession session = createSessionExpectations(valve, false); Request request = createRequestExpectations("127.0.0.1", session, false); @@ -60,6 +67,32 @@ public class TestCrawlerSessionManagerVa EasyMock.verify(request, session); } + @Test + public void testCrawlerMultipleHostsHostAware() throws Exception { + CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); + valve.setCrawlerUserAgents(valve.getCrawlerUserAgents()); + valve.setHostAware(true); + valve.setContextAware(true); + valve.setNext(EasyMock.createMock(Valve.class)); + + verifyCrawlingLocalhost(valve, "localhost"); + verifyCrawlingLocalhost(valve, "example.invalid"); + } + + + private void verifyCrawlingLocalhost(CrawlerSessionManagerValve valve, String hostname) + throws IOException, ServletException { + HttpSession session = createSessionExpectations(valve, true); + Request request = createRequestExpectations("127.0.0.1", session, true, hostname, "tomcatBot 1.0"); + + EasyMock.replay(request, session); + + valve.invoke(request, EasyMock.createMock(Response.class)); + + EasyMock.verify(request, session); + } + + private HttpSession createSessionExpectations(CrawlerSessionManagerValve valve, boolean isBot) { HttpSession session = EasyMock.createMock(HttpSession.class); if (isBot) { @@ -72,15 +105,36 @@ public class TestCrawlerSessionManagerVa return session; } + private Request createRequestExpectations(String ip, HttpSession session, boolean isBot) { + return createRequestExpectations(ip, session, isBot, "localhost", "something 1.0"); + } + + private Request createRequestExpectations(String ip, HttpSession session, boolean isBot, String hostname, String userAgent) { Request request = EasyMock.createMock(Request.class); EasyMock.expect(request.getRemoteAddr()).andReturn(ip); + EasyMock.expect(request.getHost()).andReturn(simpleHostWithName(hostname)); + EasyMock.expect(request.getContext()).andReturn(simpleContextWithName()); IExpectationSetters<HttpSession> setter = EasyMock.expect(request.getSession(false)) .andReturn(null); if (isBot) { setter.andReturn(session); } - EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.emptyEnumeration()); + EasyMock.expect(request.getHeaders("user-agent")).andAnswer(() -> Collections.enumeration(Arrays.asList(userAgent))); return request; } + + private Host simpleHostWithName(String hostname) { + Host host = EasyMock.createMock(Host.class); + EasyMock.expect(host.getName()).andReturn(hostname); + EasyMock.replay(host); + return host; + } + + private Context simpleContextWithName() { + Context context = EasyMock.createMock(Context.class); + EasyMock.expect(context.getName()).andReturn("/examples"); + EasyMock.replay(context); + return context; + } } Modified: tomcat/trunk/webapps/docs/changelog.xml URL: http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/changelog.xml?rev=1829934&r1=1829933&r2=1829934&view=diff ============================================================================== --- tomcat/trunk/webapps/docs/changelog.xml (original) +++ tomcat/trunk/webapps/docs/changelog.xml Mon Apr 23 19:48:11 2018 @@ -73,6 +73,11 @@ access Java 11 support to the annotation scanning code. (markt) </add> <fix> + <bug>62297</bug>: Enable the <code>CrawlerSessionManagerValve</code> to + correctly handle bots that crawl multiple hosts and/or web applications + when the Valve is configured on a Host or an Engine. (fschumacher) + </fix> + <fix> <bug>62309</bug>: Fix a <code>SecurityException</code> when using JASPIC under a <code>SecurityManager</code> when authentication is not mandatory. (markt) Modified: tomcat/trunk/webapps/docs/config/valve.xml URL: http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/config/valve.xml?rev=1829934&r1=1829933&r2=1829934&view=diff ============================================================================== --- tomcat/trunk/webapps/docs/config/valve.xml (original) +++ tomcat/trunk/webapps/docs/config/valve.xml Mon Apr 23 19:48:11 2018 @@ -1820,6 +1820,13 @@ </p> </attribute> + <attribute name="contextAware" required="false"> + <p>Flag to use the context name together with the client IP to + identify the session to re-use. Can be combined with <code>hostAware</code>. + Default value: <code>true</code> + </p> + </attribute> + <attribute name="crawlerIps" required="false"> <p>Regular expression (using <code>java.util.regex</code>) that client IP is matched against to determine if a request is from a web crawler. @@ -1833,6 +1840,13 @@ <code>.*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*</code> is used.</p> </attribute> + <attribute name="hostAware" required="false"> + <p>Flag to use the configured host together with the client IP to + identify the session to re-use. Can be combined with <code>contextAware</code>. + Default value: <code>true</code> + </p> + </attribute> + <attribute name="sessionInactiveInterval" required="false"> <p>The minimum time in seconds that the Crawler Session Manager Valve should keep the mapping of client IP to session ID in memory without any --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@tomcat.apache.org For additional commands, e-mail: dev-h...@tomcat.apache.org