really need help

2008-06-06 Thread Golu
Respected members, please help me; I am really in need of money. Please
donate through my site, http://www.computersolution.co.cc. I will be
very thankful to you. Please donate at least $5 or $2 through my site,
http://www.computersolution.co.cc. I hope I will be able to clear my
debts because of you all.


problems with mysql db

2009-06-29 Thread golu
Here is my code. Please tell me why I am getting the error "int
argument required" on the line marked #error below, even though I am
passing an int value.
#the code
import os
import string
import MySQLdb
import stopcheck

conn = MySQLdb.connect(host='localhost', user='root', db='urdb')

def file_extractor(dir_name):
    url_count = 0
    for file in os.listdir(dir_name):
        if file[-4:] == '.txt':
            file_path = os.path.join(dir_name, file)
            curse = conn.cursor()
            url_count += 1
            curse.execute("INSERT INTO URL_TABLE VALUES(%d,%s)",
                          (url_count, file_path))  #error
            word_extractor(url_count, file_path)

def word_extractor(url_count, file):
    fhandle = open(file)
    line = fhandle.readline()
    k = stopcheck.checker()
    k.create()
    while line:
        words = line.split()
        cursor = conn.cursor()
        for word1 in words:
            if word1 not in string.punctuation:
                if (k.check(word1) is 0) and (word1[0:4] != 'http'):
                    word_count += 1
                    try:
                        cursor.execute("INSERT INTO word_table(id,word) VALUES(%d,%s)",
                                       (word_count, word1))
                        cursor.execute("INSERT INTO wordmatch(word_id,url_id) VALUES(%d,%d)",
                                       (word_count, url_count))
                    except MySQLdb.Error, e:
                        print "Error %d: %s" % (e.args[0], e.args[1])
        line = fhandle.readline()

if __name__ == '__main__':
    #url_count=0
    #word_count=0
    dir = os.path.join('D://', 'acm')
    file_extractor(dir)
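
For reference, a sketch of the usual fix (assuming the same tables):
MySQLdb accepts only %s placeholders, whatever the Python type of the
value. The driver escapes every parameter itself before interpolating
it into the query string, so a %d placeholder ends up applied to an
already-quoted string and fails with "int argument required" even
though url_count really is an int.

    # Sketch, not the poster's final code: every placeholder is %s,
    # even for integer columns; MySQLdb converts the values itself.
    curse.execute("INSERT INTO URL_TABLE VALUES (%s, %s)",
                  (url_count, file_path))
    conn.commit()  # transactional tables (e.g. InnoDB) need an explicit commit

The same change applies to both inserts in word_extractor, which also
needs word_count initialised before its first increment (it is only
referenced, never set).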


code debugging

2009-07-25 Thread golu
Here is some code that crawls the links sent to it. There is some
problem with the retrieve_url function; please help me debug it. The
function retrieves pages and saves them to files.
#TODO: Visited dict grows in size; it needs to be handled smartly.
#Moreover, the server program needs to be in sync with the client, e.g. Myrobot.
#Take care of the 'if-modified-since' header, repeated links, hash links.
#This is the client side of the distributed crawling framework.
#It gets the list of urls to be crawled,
#then crawls the urls and stores the pages in a temporary archive
#which is then transferred to the server or grey_matter.
import httplib
import os
import sys
import urlparse
import urllib2
import urllib
import zipfile
import threading

from socket import *

PAGE_DIR = "C:/users/jayesh/pages/"  # directory where the web pages are stored
                                     # temporarily before transfer to the grey_matter
visited = {}  # a dict to remember visited urls
ROBOT_COUNT = 4


def fget():
    """ This function retrieves the zipped file
    containing the list of urls from the grey_matter and
    saves them in a local file 'list.txt'. """

    httplib.HTTPConnection.debuglevel = 1
    request = urllib2.Request('http://192.168.153.57/list.zip')  # requesting the zipped
    request.add_header('Accept-encoding', 'gzip')                # file containing the
    opener = urllib2.build_opener()                              # list of urls
    flag = 1
    s = 'Waiting for server'
    while flag == 1:
        try:
            op = opener.open(request)
            flag = 0
        except:
            s = s + '*'
            print s
    f = open('list.zip', "wb")
    f.write(op.read())
    f.close()
    z = zipfile.ZipFile('list.zip')
    p = z.namelist()
    g = open('list.txt', "wb")
    g.write(z.read(p[0]))
    g.close()
    print 'got zipped file'

def compress():
    """ This function compresses the crawled pages and stores them in
    a single compressed file ready to be sent to the
    grey_matter."""

    zfile = zipfile.ZipFile('C:/xampp/htdocs/pages.zip', mode='w')
    for fil in os.listdir(PAGE_DIR):
        full = os.path.join(PAGE_DIR, fil)
        zfile.write(full, fil)
        os.remove(full)
    os.rmdir(PAGE_DIR)  # removing the directory after transfer to grey_matter


x = 0

class robot(threading.Thread):
    """ The main robot class which does the crawling of the listed
    urls it receives from the grey_matter. It uses 3 threads which
    crawl the listed urls synchronously."""

    def __init__(self, urllist, urllistlock, dblock):
        threading.Thread.__init__(self)
        self.urllist = urllist
        self.urllistlock = urllistlock
        self.dblock = dblock

    def popurl(self):
        """ This method pops urls off the url list one by one
        and sends them for retrieval."""

        self.urllistlock.acquire(1)
        if len(self.urllist) < 1:
            Nexturl = None
        else:
            Nexturl = self.urllist[0]
            if Nexturl[-1] == '\n':
                Nexturl = Nexturl[:-1]
            del self.urllist[0]
        self.urllistlock.release()
        return Nexturl

    def retrieve_url(self, url):
        """ The main method of the robot class, called from the
        run method to retrieve the given urls from the web."""
        global x
        if url is not None:
            try:
                if visited.has_key(url):
                    return
                pieces = urlparse.urlparse(url)
                filepath = pieces[2]
                if filepath != '':
                    filepath = filepath[1:]
                    filename = filepath.split("/")[-1]
                else:
                    filename = str(x) + '.htm'  # str() needed: x is an int,
                    x += 1                      # and int + str raises TypeError

                path = os.path.join(PAGE_DIR, filename)
                url = urlparse.urlunparse(pieces)
                p = url.rfind('#')  # temporary: strip fragment identifiers
                if p != -1:
                    url = url[:p]

                visited[url] = 1
                m = urllib2.urlopen(url)

                fopen = open(path, 'wb')
                fopen.seek(0)
                fopen.write(url + '|')
                fopen.write(m.read())
                fopen.close()
                print url, 'retrieved'
            except IOError:
                print url
                print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
        return

    def run(self):
        while 1:
            url = self.popurl()
            if url is None:
                break
            try:
                self.retrieve_url(url)
            except:
                sys.exit()

if __name__ == '__main__':

    s = socket(AF_INET, SOCK_STREAM)
    s.bind(('', 444))
    s.listen(5)
    q, v = s.accept()
    count = 1
    print 'Connecting...'
    while 1:
        print 'Phase: %s' % (count)
        message = q.recv(3)
        if message != 'yes':
            continue
        print 'Connected'
        count = count + 1
        fget()  # calling the fget method to get the url list from grey_matter (server)
        try:
            os.mkdir(PAGE_DIR)
        except:
            print 'Cant make dir'
        try:
            f = open('list.txt', 'r')
            urllist = f.readlines()
            f.close()
        except:
            print 'Error opening url list'
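
No traceback was posted, so this is a guess rather than a diagnosis: a
common way for the saving step to fail on Windows is a filename lifted
straight from the url path that still contains characters such as ?, :
or &, which open() rejects with IOError. A minimal guard before opening
the file might look like this; the re-based sanitising is illustrative,
not from the original post:

    import re

    # Sketch: replace anything unsafe in a Windows file name.
    safe_name = re.sub(r'[^A-Za-z0-9._-]', '_', filename)
    path = os.path.join(PAGE_DIR, safe_name)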

Re: code debugging

2009-07-25 Thread golu
On Jul 26, 11:28 am, Chris Rebert wrote:
> On Sat, Jul 25, 2009 at 11:23 PM, golu wrote:
> > Here is some code that crawls the links sent to it. There is some
> > problem with the retrieve_url function; please help me debug it. The
> > function retrieves pages and saves them to files.
>
> Please specify exactly what the problem is that you are experiencing.
> If you are getting an error, please provide the error message and full
> traceback.
>
> Cheers,
> Chris
> --http://blog.rebertia.com

I want to save the pages in a directory, and I am using the urls to
derive the filenames. The program gets stuck in the saving step. Can
you suggest a way to save a page, e.g. google.com, as a file such as
google.html?


web page retrieve problems

2009-07-26 Thread golu
The following function retrieves pages from the web and saves them in a
specified directory. I want to derive the filename from each url, e.g.
the page code.google.com should be saved as code-google.htm or
something similar. Can you suggest a way to do it? (One possible
approach is sketched after the code below.)
def retrieve_url(self, url):
    """ The main method of the robot class, called from the
    run method to retrieve the given urls from the web."""

    if url is not None:
        try:
            if visited.has_key(url):
                return
            pieces = urlparse.urlparse(url)
            filepath = pieces[2]
            if filepath != '':
                filepath = filepath[1:]
                filename = filepath.split("/")[-1]
            else:
                filename = 'home.htm'

            path = os.path.join(PAGE_DIR, filename)
            url = urlparse.urlunparse(pieces)
            p = url.rfind('#')  # temporary: strip fragment identifiers
            if p != -1:
                url = url[:p]

            visited[url] = 1
            m = urllib2.urlopen(url)

            fopen = open(path, 'wb')
            fopen.seek(0)
            fopen.write(url + '|')
            fopen.write(m.read())
            fopen.close()
            print url, 'retrieved'
        except IOError:
            print url
            print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
    return
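
A sketch of one way to do it (the helper name and the dash convention
are assumptions, not from the thread): build the name from the url's
host plus path and collapse anything that is not alphanumeric into
dashes, so http://code.google.com/ becomes code-google-com.htm.

    import re
    import urlparse

    def url_to_filename(url):
        # e.g. 'http://code.google.com/' -> 'code-google-com.htm'
        pieces = urlparse.urlparse(url)
        raw = pieces[1] + pieces[2]  # netloc + path
        name = re.sub(r'[^A-Za-z0-9]+', '-', raw).strip('-')
        return (name or 'home') + '.htm'

With a helper like this, the whole if/else over filepath in
retrieve_url collapses to a single filename = url_to_filename(url),
and every page gets a distinct, filesystem-safe name.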


cgi script

2009-08-01 Thread golu
Hi,
I started learning cgi in Python a few days ago, and everything went
fine until I started getting the following error:
"
The server encountered an internal error and was unable to complete
your request. Either the server is overloaded or there was an error in
a CGI script.

If you think this is a server error, please contact the webmaster. "
I am using Apache on XAMPP. Please help.
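
That generic page usually means the script died before emitting valid
headers, and the real error went to Apache's error.log under the XAMPP
installation directory. A minimal sketch that tends to get past the 500
page; the interpreter path in the shebang line is an assumption, so
point it at your own Python:

    #!C:/Python26/python.exe
    import cgitb
    cgitb.enable()  # shows the Python traceback in the browser

    print "Content-Type: text/html"
    print           # a blank line ends the HTTP headers

    print "<html><body>It works.</body></html>"

If the Content-Type header or the shebang line is wrong, Apache falls
back to exactly the generic error quoted above.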