Author: violetagg Date: Mon Jun 12 21:04:53 2017 New Revision: 1798533 URL: http://svn.apache.org/viewvc?rev=1798533&view=rev Log: A new configuration property 'crawlerIps' is added to the 'o.a.catalina.valves.CrawlerSessionManagerValve'. Using this property one can specify a regular expression that will be used to identify crawlers based on their IP address. Based on a patch provided by Tetradeus via GitHub.
Added: tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java (with props) Modified: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java tomcat/trunk/webapps/docs/changelog.xml tomcat/trunk/webapps/docs/config/valve.xml Modified: tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java URL: http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java?rev=1798533&r1=1798532&r2=1798533&view=diff ============================================================================== --- tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java (original) +++ tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java Mon Jun 12 21:04:53 2017 @@ -49,6 +49,10 @@ public class CrawlerSessionManagerValve private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo! Slurp.*|.*Feedfetcher-Google.*"; private Pattern uaPattern = null; + + private String crawlerIps = null; + private Pattern ipPattern = null; + private int sessionInactiveInterval = 60; @@ -86,6 +90,31 @@ public class CrawlerSessionManagerValve /** + * Specify the regular expression (using {@link Pattern}) that will be used + * to identify crawlers based on their IP address. The default is no crawler + * IPs. + * + * @param crawlerIps The regular expression using {@link Pattern} + */ + public void setCrawlerIps(String crawlerIps) { + this.crawlerIps = crawlerIps; + if (crawlerIps == null || crawlerIps.length() == 0) { + ipPattern = null; + } else { + ipPattern = Pattern.compile(crawlerIps); + } + } + + /** + * @see #setCrawlerIps(String) + * @return The current regular expression being used to match IP addresses. + */ + public String getCrawlerIps() { + return crawlerIps; + } + + + /** * Specify the session timeout (in seconds) for a crawler's session. This is * typically lower than that for a user session. The default is 60 seconds. 
* @@ -122,11 +151,11 @@ public class CrawlerSessionManagerValve boolean isBot = false; String sessionId = null; - String clientIp = null; + String clientIp = request.getRemoteAddr(); if (log.isDebugEnabled()) { - log.debug(request.hashCode() + ": ClientIp=" + request.getRemoteAddr() - + ", RequestedSessionId=" + request.getRequestedSessionId()); + log.debug(request.hashCode() + ": ClientIp=" + clientIp + ", RequestedSessionId=" + + request.getRequestedSessionId()); } // If the incoming request has a valid session ID, no action is required @@ -155,9 +184,16 @@ public class CrawlerSessionManagerValve } } + if (ipPattern != null && ipPattern.matcher(clientIp).matches()) { + isBot = true; + + if (log.isDebugEnabled()) { + log.debug(request.hashCode() + ": Bot found. IP=" + clientIp); + } + } + // If this is a bot, is the session ID known? if (isBot) { - clientIp = request.getRemoteAddr(); sessionId = clientIpSessionId.get(clientIp); if (sessionId != null) { request.setRequestedSessionId(sessionId); Added: tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java URL: http://svn.apache.org/viewvc/tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java?rev=1798533&view=auto ============================================================================== --- tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java (added) +++ tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java Mon Jun 12 21:04:53 2017 @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.catalina.valves; + +import java.util.Collections; + +import javax.servlet.http.HttpSession; + +import org.junit.Test; + +import org.apache.catalina.Valve; +import org.apache.catalina.connector.Request; +import org.apache.catalina.connector.Response; +import org.easymock.EasyMock; +import org.easymock.IExpectationSetters; + +public class TestCrawlerSessionManagerValve { + + @Test + public void testCrawlerIpsPositive() throws Exception { + CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); + valve.setCrawlerIps("216\\.58\\.206\\.174"); + valve.setNext(EasyMock.createMock(Valve.class)); + HttpSession session = createSessionExpectations(valve, true); + Request request = createRequestExpectations("216.58.206.174", session, true); + + EasyMock.replay(request, session); + + valve.invoke(request, EasyMock.createMock(Response.class)); + + EasyMock.verify(request, session); + } + + @Test + public void testCrawlerIpsNegative() throws Exception { + CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve(); + valve.setCrawlerIps("216\\.58\\.206\\.174"); + valve.setNext(EasyMock.createMock(Valve.class)); + HttpSession session = createSessionExpectations(valve, false); + Request request = createRequestExpectations("127.0.0.1", session, false); + + EasyMock.replay(request, session); + + valve.invoke(request, EasyMock.createMock(Response.class)); + + EasyMock.verify(request, session); + } + + private HttpSession createSessionExpectations(CrawlerSessionManagerValve valve, boolean isBot) { + HttpSession session = 
EasyMock.createMock(HttpSession.class); + if (isBot) { + EasyMock.expect(session.getId()).andReturn("id").times(2); + session.setAttribute(valve.getClass().getName(), valve); + EasyMock.expectLastCall(); + session.setMaxInactiveInterval(60); + EasyMock.expectLastCall(); + } + return session; + } + + private Request createRequestExpectations(String ip, HttpSession session, boolean isBot) { + Request request = EasyMock.createMock(Request.class); + EasyMock.expect(request.getRemoteAddr()).andReturn(ip); + IExpectationSetters<HttpSession> setter = EasyMock.expect(request.getSession(false)) + .andReturn(null); + if (isBot) { + setter.andReturn(session); + } + EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.emptyEnumeration()); + return request; + } +} Propchange: tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: tomcat/trunk/webapps/docs/changelog.xml URL: http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/changelog.xml?rev=1798533&r1=1798532&r2=1798533&view=diff ============================================================================== --- tomcat/trunk/webapps/docs/changelog.xml (original) +++ tomcat/trunk/webapps/docs/changelog.xml Mon Jun 12 21:04:53 2017 @@ -100,6 +100,13 @@ <code>o.a.catalina.startup.Tomcat</code>. Patch provided by peterhansson_se. (violetagg) </fix> + <add> + A new configuration property <code>crawlerIps</code> is added to the + <code>o.a.catalina.valves.CrawlerSessionManagerValve</code>. Using this + property one can specify a regular expression that will be used to + identify crawlers based on their IP address. Based on a patch provided + by Tetradeus. 
(violetagg) + </add> </changelog> </subsection> <subsection name="Coyote"> Modified: tomcat/trunk/webapps/docs/config/valve.xml URL: http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/config/valve.xml?rev=1798533&r1=1798532&r2=1798533&view=diff ============================================================================== --- tomcat/trunk/webapps/docs/config/valve.xml (original) +++ tomcat/trunk/webapps/docs/config/valve.xml Mon Jun 12 21:04:53 2017 @@ -1651,6 +1651,12 @@ </p> </attribute> + <attribute name="crawlerIps" required="false"> + <p>Regular expression (using <code>java.util.regex</code>) that client + IP is matched against to determine if a request is from a web crawler. + By default such regular expression is not set.</p> + </attribute> + <attribute name="crawlerUserAgents" required="false"> <p>Regular expression (using <code>java.util.regex</code>) that the user agent HTTP request header is matched against to determine if a request --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org For additional commands, e-mail: dev-help@tomcat.apache.org