really need help
Please help me, I am really in need of money. Please pay me through a donation from my site, http://www.computersolution.co.cc. I will be very thankful to you. Please donate at least $5 or $2 through my site http://www.computersolution.co.cc. I hope I will be able to clear my debts because of you all.
problems with mysql db
Here I have posted my code. Please tell me why I am getting the error "int
argument required" on the hash-marked line (see below), even though I am
passing an int value.
#the code
import os
import string
import MySQLdb
import stopcheck

conn = MySQLdb.connect(host='localhost',user='root',db='urdb')

def file_extractor(dir_name):
    url_count = 0
    for file in os.listdir(dir_name):
        if(file[-4:] == '.txt'):
            file_path = os.path.join(dir_name,file)
            curse = conn.cursor()
            url_count += 1
            curse.execute("INSERT INTO URL_TABLE VALUES(%d,%s)",
                          (url_count,file_path)) #error
            word_extractor(url_count,file_path)

def word_extractor(url_count,file):
    fhandle = open(file)
    line = fhandle.readline()
    k = stopcheck.checker()
    k.create()
    while line:
        words = line.split()
        cursor = conn.cursor()
        for word1 in words:
            if word1 not in string.punctuation:
                if (k.check(word1) is 0) and (word1[0:4] != 'http'):
                    word_count += 1
                    try:
                        cursor.execute("INSERT INTO word_table(id,word) VALUES(%d,%s)",
                                       (word_count,word1))
                        cursor.execute("INSERT INTO wordmatch (word_id,url_id) values(%d,%d)",
                                       (word_count,url_count))
                    except MySQLdb.Error, e:
                        print "Error %d: %s" % (e.args[0], e.args[1])
        line = fhandle.readline()

if __name__ == '__main__':
    #url_count=0
    #word_count=0
    dir = os.path.join('D://','acm')
    file_extractor(dir)
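For what it's worth, the usual cause of that error is the placeholder style rather than the value: MySQLdb's cursor.execute() binds every parameter through %s, whatever its Python type, so a %d in the SQL string makes the driver %-format the already-quoted value itself and fail. A minimal sketch of the inserts with the placeholders changed (same tables and variable names as the code above):

    # MySQLdb quotes the parameters itself; use %s for every placeholder,
    # including integer columns.
    curse.execute("INSERT INTO URL_TABLE VALUES (%s, %s)",
                  (url_count, file_path))

    cursor.execute("INSERT INTO word_table (id, word) VALUES (%s, %s)",
                   (word_count, word1))
    cursor.execute("INSERT INTO wordmatch (word_id, url_id) VALUES (%s, %s)",
                   (word_count, url_count))

Note also that word_count is never initialised inside word_extractor (the assignment in __main__ is commented out and would be in a different scope anyway), so word_count += 1 will raise an UnboundLocalError once the placeholder issue is fixed.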
code debugging
Here is code which crawls links sent to it. There is some problem with
the retrieve_url function; please help me debug it. This function
retrieves pages and saves them to files.
#TODO: Visited dict grows in size; it needs to be handled smartly
#Moreover the server program needs to be in sync with the client, e.g. Myrobot
#Take care of tag - 'if modified since', repeated links, hash links
#This is the client side of the distributed crawling framework
#It gets the list of urls to be crawled
#Then crawls the urls and stores the pages in a temporary archive
#which is then transferred to the server or grey_matter
import httplib
import os
import sys
import urlparse
import urllib2
import urllib
import zipfile
import threading
from socket import *

PAGE_DIR = "C:/users/jayesh/pages/"   # directory where the web pages are stored
                                      # temporarily before transfer to the grey_matter
visited = {}                          # a dict to remember visited urls
ROBOT_COUNT = 4
def fget():
    """ This function retrieves the zipped file
    containing the list of urls from the grey_matter and
    saves them in a local file 'list.txt'. """
    httplib.HTTPConnection.debuglevel = 1
    request = urllib2.Request('http://192.168.153.57/list.zip')   #Requesting the zipped file
    request.add_header('Accept-encoding','gzip')                  #containing the list of urls
    opener = urllib2.build_opener()
    flag = 1
    s = 'Waiting for server'
    while flag == 1:
        try:
            op = opener.open(request)
            flag = 0
        except:
            s = s + '*'
            print s
    f = open('list.zip',"wb")
    f.write(op.read())
    f.close()
    z = zipfile.ZipFile('list.zip')
    p = z.namelist()
    g = open('list.txt',"wb")
    g.write(z.read(p[0]))
    g.close()
    print 'got zipped file'

def compress():
    """ This function compresses the crawled pages and stores them in
    a single compressed file ready to be sent to the grey_matter."""
    zfile = zipfile.ZipFile('C:/xampp/htdocs/pages.zip',mode='w')
    for fil in os.listdir(PAGE_DIR):
        full = os.path.join(PAGE_DIR,fil)
        zfile.write(full,fil)
        os.remove(full)
    os.rmdir(PAGE_DIR)   #Removing the directory after transfer to grey_matter

x = 0
class robot(threading.Thread):
    """ The main robot class which does the crawling of the listed
    urls it receives from the grey matter. It uses 3 threads which
    crawl the listed urls synchronously."""
    def __init__(self,urllist,urllistlock,dblock):
        threading.Thread.__init__(self)
        self.urllist = urllist
        self.urllistlock = urllistlock
        self.dblock = dblock

    def popurl(self):
        """ This method pops urls off the url list one by one
        and sends them for retrieval."""
        self.urllistlock.acquire(1)
        if len(self.urllist) < 1:
            Nexturl = None
        else:
            Nexturl = self.urllist[0]
            if Nexturl[-1] == '\n':
                Nexturl = Nexturl[:-1]
            del self.urllist[0]
        self.urllistlock.release()
        return Nexturl

    def retrieve_url(self,url):
        """ The main method of the robot class; it is called from the
        run method to retrieve the given urls from the web."""
        global x
        if url is not None:
            try:
                if visited.has_key(url):
                    return
                pieces = urlparse.urlparse(url)
                filepath = pieces[2]
                if filepath != '':
                    filepath = filepath[1:]
                    filename = filepath.split("/")[-1]
                else:
                    filename = x+'.htm'
                    x += 1
                path = os.path.join(PAGE_DIR,filename)
                url = urlparse.urlunparse(pieces)
                p = url.rfind('#')   #temporary
                if p != -1:
                    url = url[:p]
                visited[url] = 1
                m = urllib2.urlopen(url)
                fopen = open(path,'wb')
                fopen.seek(0)
                fopen.write(url+'|')
                fopen.write(m.read())
                fopen.close()
                print url,'retrieved'
            except IOError:
                print url
                print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
            return

    def run(self):
        while(1):
            url = self.popurl()
            if url is None:
                break
            try:
                self.retrieve_url(url)
            except:
                sys.exit()

if __name__ == '__main__':
    s = socket(AF_INET,SOCK_STREAM)
    s.bind(('',444))
    s.listen(5)
    q,v = s.accept()
    count = 1
    print 'Connecting...'
    while 1:
        print 'Phase: %s' % (count)
        message = q.recv(3)
        if message != 'yes':
            continue
        print 'Connected'
        count = count+1
        fget()   # Calling fget to get the url list from grey_matter (the server).
        try:
            os.mkdir(PAGE_DIR)
        except:
            print 'Cant make dir'
        try:
            f = open('list.txt','r')
            urllist = f.readlines()
            f.close()
        except:
            print 'Error opening url list'
Re: code debugging
On Jul 26, 11:28 am, Chris Rebert wrote:
> On Sat, Jul 25, 2009 at 11:23 PM, golu wrote:
> > here is code which crawls links sent to it. there's some problem with
> > the retrieve_url function, please help me out in debugging the function
> > retrieve_url. This function retrieves pages and saves them to files.
>
> Please specify exactly what the problem is that you are experiencing.
> If you are getting an error, please provide the error message and full
> traceback.
>
> Cheers,
> Chris
> --
> http://blog.rebertia.com

I want to save the pages in a directory, and I am using the urls to get the filenames. The program gets stuck in the saving step. Can you suggest a way to save a page, e.g. google.com, as a file such as google.html?
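If it is "getting stuck" with no output, one likely culprit in retrieve_url above is the fallback branch filename = x+'.htm': x is an int, and adding an int to a str raises a TypeError in Python 2, which the bare except in run() then turns into a silent sys.exit(). A small sketch of that branch with the counter converted first (same names as the code above):

        else:
            # x is the global int counter; convert it to a string before
            # concatenating the '.htm' suffix
            filename = str(x) + '.htm'
            x += 1

The bare except: sys.exit() in run() is also worth replacing with something that prints the traceback, otherwise every unexpected error looks like a stall.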
web page retrieve problems
The following function retrieves pages from the web and saves them in
a specified directory. I want to derive the respective filenames from the
URLs, e.g. the page code.google.com should be saved as code-google.htm or
something similar. Can you suggest a way to do it?
def retrieve_url(self,url):
    """ The main method of the robot class; it is called from the
    run method to retrieve the given urls from the web."""
    if url is not None:
        try:
            if visited.has_key(url):
                return
            pieces = urlparse.urlparse(url)
            filepath = pieces[2]
            if filepath != '':
                filepath = filepath[1:]
                filename = filepath.split("/")[-1]
            else:
                filename = 'home.htm'
            path = os.path.join(PAGE_DIR,filename)
            url = urlparse.urlunparse(pieces)
            p = url.rfind('#')   #temporary
            if p != -1:
                url = url[:p]
            visited[url] = 1
            m = urllib2.urlopen(url)
            fopen = open(path,'wb')
            fopen.seek(0)
            fopen.write(url+'|')
            fopen.write(m.read())
            fopen.close()
            print url,'retrieved'
        except IOError:
            print url
            print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"
        return
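One simple approach (not the only one) is to take the filename from the last path component when the URL has a path, and otherwise build one from the hostname with the dots replaced by dashes. A rough sketch in the same Python 2 style as the code above; the helper name url_to_filename is only for illustration:

    import urlparse

    def url_to_filename(url):
        # 'http://code.google.com/'           -> 'code-google-com.htm'
        # 'http://example.com/docs/page.html' -> 'page.html'
        pieces = urlparse.urlparse(url)
        path = pieces[2].strip('/')
        if path:
            return path.split('/')[-1]     # last path component
        host = pieces[1].split(':')[0]     # hostname without any :port
        return host.replace('.', '-') + '.htm'

Names built this way can still collide (two different URLs ending in index.html, for instance), so appending a counter or a hash of the full URL is worth considering if uniqueness matters.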
cgi script
Hi, I started learning CGI in Python a few days ago and everything went fine until I started getting the following error: "The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there was an error in a CGI script. If you think this is a server error, please contact the webmaster." I am using Apache on XAMPP. Please help.
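That page is Apache's generic 500 response, so the first step is to get the real error out of the script. Two things to check: Apache's error log (with XAMPP it is usually apache/logs/error.log), and the cgitb module, which makes Python show the traceback in the browser instead of failing silently. A minimal test script, assuming Python 2 and that Apache is configured to run .py files as CGI; the interpreter path on the first line is only an example and must match your installation:

    #!C:/Python27/python.exe
    # Common causes of the generic 500 error: wrong interpreter path on the
    # line above, the script not being in a CGI-enabled directory, or
    # output sent before the blank line that ends the HTTP headers.
    import cgitb
    cgitb.enable()          # show tracebacks in the browser

    print "Content-Type: text/html"
    print                   # blank line terminates the headers
    print "<html><body>CGI is working.</body></html>"

If this script works but yours does not, the difference is usually in the headers or in an exception raised before anything is printed.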
