On 2/5/07, Erik Hatcher <[EMAIL PROTECTED]> wrote:
The facets are bibliographic metadata about library holdings, such as
genre, subject, format, published date (year), and others.  Basically
an open source thing like this:

        <http://www2.lib.ncsu.edu/catalog/?N=201015&Ns=Call+Number+sort%
7c0&sort=5>

(if that link didn't work, hit the main page at <http://
www.lib.ncsu.edu/catalog/browse.html> and drill in a little)

The data is really ugly, and there are typically several values per
field, so all facets are currently set as multiValued.

You may have some difficulties with faceting by author...
What type of hardware do you have for this?

For testing, we have an in-house performance lab that collects all
sorts of cool data.
But for quick-n-dirty performance hacking, I prefer something simple
like the following.
It requires a list of queries to start with (we normally do
performance testing with data derived from real query logs).

------------------- perf.py ---------------
import urllib2
import time
import threading
import random

def geturl(url):
 #print "###getting ",url
 f = urllib2.urlopen(url)
 data = f.read()
 if verbose: print url + "\n" + data
 #headers = f.info()
 f.close()
 #print "!!!got request"


# Shared state mutated by all client threads.
# NOTE(review): appended to from multiple threads without a lock; this
# relies on CPython's GIL making list.append atomic — fine for this
# quick-n-dirty benchmark, confirm before porting elsewhere.
res=[]            # (elapsed_seconds, url) for every request attempted
requests_left=0   # declared global in test() but never read or written
errors=[]         # (url, exception) for every request that raised

def test(uri, urllist):
 global res,requests_left,errors
 for url in urllist:
   if len(res)>=requests: break
   start=time.time()
   try:
     geturl(uri+url+append)
   except Exception,e:
     errors.append((url,e))
     print "CAUGHT EXCEPTION",e
   elapsed=time.time()-start
   res.append( (elapsed,url) )

class TestThread(threading.Thread):
 """One simulated client: runs test() over its own URL list.

 Constructor arguments are stashed as-is and splatted into test()
 when the thread starts.
 """
 def __init__(self, *test_args):
   threading.Thread.__init__(self)
   self.args = test_args

 def run(self):
   test(*self.args)

#argument defaults, overridable from the command line (see below)
clients=1; requests=1; randomize=True; queries='dict.txt'; append=''
verbose=False
uri='http://cn-ewr1-dev40-pi2:5051'

#poor man's argument parsing: each argv item is exec'd as a Python
#statement, e.g.  perf.py "clients=10" "requests=1000"
# NOTE(review): exec of raw argv runs arbitrary code — acceptable for an
# in-house benchmark driver, never expose this to untrusted input.
import sys
for statement in sys.argv[1:]: exec(statement)

lst = [ line.strip() for line in open(queries).readlines() ]

thr=[]
for client in range(clients):
 ulst=lst
 if randomize:
   ulst=ulst[:]
   random.shuffle(ulst)
 thr.append(TestThread(uri,ulst))
print "############## starting all threads"
start=time.time()
for cli in thr: cli.start()
print "############## waiting for all threads"
for cli in thr: cli.join()
elapsed=time.time()-start


res.sort()
n=len(res)

print 'Slowest Queries:'
for i in res[:-10:-1]: print '%.3f %s' % i

print
print len(errors),'Errors'
for i in errors[:10]: print i

print
print 'total requests=%d    clients=%d' % (n,clients)
print 'throughput=%.1f queries/second' % (n/elapsed)
print "99.9%%=%.3f seconds" % res[int(n*.999)][0]
print "99%%=%.3f" % res[int(n*.99)][0]
print "98%%=%.3f" % res[int(n*.98)][0]
print "95%%=%.3f" % res[int(n*.95)][0]
print "75%%=%.3f" % res[int(n*.75)][0]
print "50%%=%.3f" % res[int(n*.50)][0]
print "avg%%=%.3f" % (sum([ r[0] for r in res ])/n)

Reply via email to