strip away html tags from extracted links
I have the following code to extract certain links from a webpage:
from bs4 import BeautifulSoup
import urllib2, sys
import re

def tonaton():
    site = "http://tonaton.com/en/job-vacancies-in-ghana"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    invalid_tag = ('h2')
    soup = BeautifulSoup(jobpass)
    print soup.find_all('h2')
The links are contained in the 'h2' tags so I get the links as follows:
<h2><a href="...">cashiers</a></h2>
<h2><a href="...">Cake baker</a></h2>
<h2><a href="...">Automobile Technician</a></h2>
<h2><a href="...">Marketing Officer</a></h2>
But I'm interested in getting rid of all the 'h2' tags so that I have links
only in this manner:
<a href="...">cashiers</a>
<a href="...">Cake baker</a>
<a href="...">Automobile Technician</a>
<a href="...">Marketing Officer</a>
I therefore updated my code to look like this:
def tonaton():
    site = "http://tonaton.com/en/job-vacancies-in-ghana"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    invalid_tag = ('h2')
    soup = BeautifulSoup(jobpass)
    jobs = soup.find_all('h2')
    for tag in invalid_tag:
        for match in jobs(tag):
            match.replaceWithChildren()
    print jobs
But I couldn't get it to work, even though that was the best logic I
could come up with. I'm a newbie, though, so I know there is something
better that could be done.
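Would something like this be closer? It's just an untested sketch of
what I think I need: pull the <a> element out of each 'h2' instead of
trying to strip the tags afterwards.

def tonaton():
    site = "http://tonaton.com/en/job-vacancies-in-ghana"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    soup = BeautifulSoup(jobpass)
    # each h2 holds one job link, so take the <a> child directly
    links = [h2.find('a') for h2 in soup.find_all('h2')]
    for link in links:
        print link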
Any help will be greatly appreciated.
Thanks
Python/Django Extract and append only new links
I am putting together a project using Python 2.7 and Django 1.5 on
Windows 7. I believe this belongs on the Django group, but I haven't had
any help from there, so I figured I would try the Python list.
I have the following view:
views.py:
def foo():
    site = "http://www.foo.com/portal/jobs"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    soup = BeautifulSoup(jobpass)
    for tag in soup.find_all('a', href=True):
        tag['href'] = urlparse.urljoin('http://www.businessghana.com/portal/', tag['href'])
    return map(str, soup.find_all('a', href=re.compile('.getJobInfo')))

def example():
    site = "http://example.com"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    soup = BeautifulSoup(jobpass)
    return map(str, soup.find_all('a', href=re.compile('.display-job')))
foo_links = foo()
example_links = example()

def all_links():
    return (foo_links + example_links)

def display_links(request):
    name = all_links()
    paginator = Paginator(name, 25)
    page = request.GET.get('page')
    try:
        name = paginator.page(page)
    except PageNotAnInteger:
        name = paginator.page(1)
    except EmptyPage:
        name = paginator.page(paginator.num_pages)
    return render_to_response('jobs.html', {'name': name})
My template looks like this:
{% for link in name %}
    {{ link|safe }}
{% endfor %}

{% if name.has_previous %}
    <a href="?page={{ name.previous_page_number }}">Previous</a>
{% endif %}

Page {{ name.number }} of {{ name.paginator.num_pages }}.

{% if name.has_next %}
    <a href="?page={{ name.next_page_number }}">next</a>
{% endif %}
Right now, as my code stands, every time it runs it scrapes all the
links on the front pages of the selected sites and presents them
paginated *all afresh*. However, I don't think it's a good idea for the
script to read/write links it has already extracted all over again, so
I would like it to check for and append only new links. I would like to
save the previously scraped links so that, over the course of say a
week, all the links that have appeared on the front pages of these
sites will be available on my site as older pages.

It's my first programming project and I don't know how to incorporate
this logic into my code.
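To make the question concrete, this is roughly the shape I imagine, as
an untested sketch (the jobLinks model and its url field are just
placeholder names for whatever I end up writing):

from listing.models import jobLinks  # hypothetical model with a URLField named 'url'

def save_new_links(scraped):
    for url in scraped:
        # get_or_create only inserts a row when no matching one
        # exists yet, so links seen on an earlier run are skipped
        jobLinks.objects.get_or_create(url=url)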
Any help/pointers/references will be greatly appreciated.
regards, Max
Pls help me...I want to save data to my database but I am unable to
This is my first programming pet project. I have the following script
that extracts links from specific sites and displays them on the web
(via Django). The script works fine, but I'm unable to save anything to
my database, so every time I run the code I get the output I want, yet
it always extracts all the content afresh. I would rather have the
content scraped earlier saved to the database, so that on subsequent
runs it only scrapes and appends ONLY new links to the list.
Any help will be appreciated.
# Create your views here.
from django.template.loader import get_template
from django.core.paginator import Paginator, EmptyPage, PageNotAnInteger
from django.shortcuts import render_to_response
from django.template import Context
from bs4 import BeautifulSoup
import urllib2, sys
import urlparse
import re
from datetime import date, datetime
from listing.models import jobLinks
def businessghana():
    site = "http://www.businessghana.com/portal/jobs"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    soup = BeautifulSoup(jobpass)
    for tag in soup.find_all('a', href=True):
        tag['href'] = urlparse.urljoin('http://www.businessghana.com/portal/', tag['href'])
    return map(str, soup.find_all('a', href=re.compile('.getJobInfo')))

def tonaton():
    site = "http://tonaton.com/en/job-vacancies-in-ghana"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    soup = BeautifulSoup(jobpass)
    result = []
    # the next loop makes all the links in the soup absolute
    for tag in soup.find_all('a', href=True):
        tag['href'] = urlparse.urljoin('http://www.tonaton.com', tag['href'])
    # assign all 'h2' tags to 'jobs'; the 'h2' tags contain the required links
    jobs = soup.find_all('h2')
    # loop through the 'h2' tags and extract all the links
    for h2 in soup.find_all('h2'):
        n = h2.next_element
        if n.name == 'a':
            result.append(str(n))
    return result
def jobscomgh():
    site = "http://jobs.com.gh"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    soup = BeautifulSoup(jobpass)
    return map(str, soup.find_all('a', href=re.compile('.display-job')))

businessghana_links = businessghana()
tonaton_links = tonaton()
jobscomgh_links = jobscomgh()

def all_links():
    return (businessghana_links + tonaton_links + jobscomgh_links)
def save_new_links(all_links):
    current_links = jobLinks.objects.all()
    for i in all_links:
        if i not in current_links:
            jobLinks.objects.create(url=i)

def this_week_links(all_links):
    return jobLinks.objects.filter(date__gte=datetime.timedelta(days=-7))

save_new_links(all_links)
this_week_links(all_links)
def display_links(request):
    name = all_links()
    paginator = Paginator(name, 25)
    page = request.GET.get('page')
    try:
        name = paginator.page(page)
    except PageNotAnInteger:
        name = paginator.page(1)
    except EmptyPage:
        name = paginator.page(paginator.num_pages)
    return render_to_response('jobs.html', {'name': name})
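One part I'm especially unsure about is this_week_links: I suspect
timedelta has to come from the datetime module (not the datetime class)
and be subtracted from now(), roughly like this untested sketch, with
pub_date being the DateTimeField in my models.py:

from datetime import datetime, timedelta

def this_week_links():
    # timedelta lives in the datetime module, not the datetime class,
    # so it needs its own import; keep links newer than a week
    one_week_ago = datetime.now() - timedelta(days=7)
    return jobLinks.objects.filter(pub_date__gte=one_week_ago)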
Pls help me...I want to save scraped data automatically to my database (cleaner version)
I have asked this question earlier, but this should make more sense
than the earlier version, and I don't want anyone who could potentially
help to be put off by the initial mess, even though I updated it with
my cleaner version as a reply.
I want the scraped links saved in my database so that on subsequent
runs it only scrapes and appends new links to the list. This is my code
below, but at the end of the day my database is empty. What changes can
I make to overcome this? Thanks in advance.
from django.template.loader import get_template
from django.shortcuts import render_to_response
from bs4 import BeautifulSoup
import urllib2, sys
import urlparse
import re
from listing.models import jobLinks

# this function extracts the links
def businessghana():
    site = "http://www.businessghana.com/portal/jobs"
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(site, headers=hdr)
    jobpass = urllib2.urlopen(req)
    soup = BeautifulSoup(jobpass)
    for tag in soup.find_all('a', href=True):
        tag['href'] = urlparse.urljoin('http://www.businessghana.com/portal/', tag['href'])
    return map(str, soup.find_all('a', href=re.compile('.getJobInfo')))
# result from businessghana() saved to a variable to make it iterable as a list
all_links = businessghana()

# this function should save the links to the database unless the link already exists
def save_new_links(all_links):
    current_links = jobLinks.objects.all()
    for i in all_links:
        if i not in current_links:
            jobLinks.objects.create(url=i)
# I call the above function here, hoping that it will save to the database
save_new_links(all_links)

# return my HttpResponse with this function
def display_links(request):
    name = all_links()
    return render_to_response('jobs.html', {'name': name})
My django models.py looks like this:
from django.db import models

class jobLinks(models.Model):
    links = models.URLField()
    pub_date = models.DateTimeField('date retrieved')

    def __unicode__(self):
        return self.links
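While re-reading this, I wonder whether a field mismatch is the
problem: save_new_links creates rows with url=i, but the model's field
is called links, and pub_date has no default value, so the create()
call should be failing. This is the untested change I have in mind:

from datetime import datetime

def save_new_links(all_links):
    for link in all_links:
        # query by the model's actual field name ('links', not 'url'),
        # and supply pub_date explicitly since it has no default
        if not jobLinks.objects.filter(links=link).exists():
            jobLinks.objects.create(links=link, pub_date=datetime.now())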
