#!/usr/bin/python
"""Cleaning newspages
"""

__author__ = "Matej Cepl (matej@ceplovi.cz)"
__version__ = "$Revision: 1.2 $"
__date__ = "$Date: 2005/07/05 23:26:38 $"
__copyright__ = "Copyright (c) 2004 Matej Cepl"
__license__ = "MIT"

import re, sys, os, os.path
import htmlentitydefs
import urllib
from sgmllib import SGMLParser

OK = 0
FORBIDDEN = 1
INSIDE = 2

MAINHEAD = 1
SUBHEAD = 2
BYLINE = 3
DATE = 4

RAWSUBDIR = "raw_html"
rawdir = os.path.abspath(RAWSUBDIR)
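
# Note: the cache directory is assumed to exist already; a fresh checkout
# would need something along these lines before the first run:
#   if not os.path.isdir(rawdir):
#       os.mkdir(rawdir)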

class PageNotAvailable(Exception):
    """Exception for the 404 error."""
    def __init__(self, msg):
        self.msg = msg
    def __str__(self):
        return self.msg

class SoupReader(SGMLParser):
    """Modified SGML parser custom tailored to read junk webpages,
    mainly omitting elements which we are not interested in and providing
    methods for cuttin uninteresting crap from the page."""

    forbiddenclasses = ()

    def __init__(self,url,name):
        """Extend SGMLParser.__init__: initialize the parser (which
        resets all inherited data attributes), set up our own ones, and
        run the whole read-parse-write pipeline."""
        SGMLParser.__init__(self)
        self.mainhead = ""
        self.subhead = ""
        self.byline = ""
        self.date = ""
        self.stuff = ""
        self.unreadable = ""
        self.pieces = []
        self.verbose = False
        self.verbatim = 0
        self.unreadability = OK
        self.url = url
        self.filename = name
        self.feed(self.read())
        # close() flushes any buffered data, so it must come before output()
        self.close()
        self.write(self.output())

    def read(self):
        """Check that already cached version of the page exists
        in the subdirectory of the current one. If not get it
        from the web."""
        rawfilename = os.path.join(rawdir,self.filename)
        if not(os.path.exists(rawfilename)):
            self.suck(self.url,rawfilename)
        ifile = open(rawfilename,"r")
        stuff = ifile.read()
        ifile.close()
        return stuff

    def suck(self,url,filename):
        """Download the newspage from the web and write it to the file with
        filename given as a second argument of the function. When
        the page is not available, then raise exception, which would be caught
        somewhere upstream. FIXME -- does this construct work?"""
        try:
            urllib.urlretrieve(url,filename)
        except IOError:
            raise PageNotAvailable("Page <%s> is not available (404)!" % url)
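
    # A stricter fetch is possible with urllib2, which, unlike urlretrieve,
    # raises urllib2.HTTPError on a 404 (a sketch, not used above):
    #
    #   import urllib2
    #   try:
    #       data = urllib2.urlopen(url).read()
    #   except urllib2.HTTPError:
    #       raise PageNotAvailable("Page <%s> is not available (404)!" % url)
    #   open(filename, "w").write(data)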

    def write(self,stuff):
        """Pipe the processed HTML through tidy and write the resulting
        XHTML to the output file."""
        ofile = open(self.filename,"w")
        tidyin, tidyout = os.popen2('tidy -config /dev/null -asxml')
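        # Caveat: popen2 can deadlock if tidy fills its stdout pipe while
        # we are still feeding its stdin; acceptable for article-sized
        # pages, but subprocess.Popen with communicate() would be the
        # robust pattern.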
        tidyin.write(stuff)
        tidyin.close()
        ofile.write(tidyout.read())
        tidyout.close()
        ofile.close()

    def stepUpUnreadability(self):
        self.unreadability += 1

    def stepDownUnreadability(self):
        self.unreadability -= 1
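
    # The unreadability counter acts as a nesting depth: entering a
    # forbidden element (or any element nested inside one) increments it,
    # leaving one decrements it, and output resumes only once it drops
    # back to OK (zero).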

    def test_class(self,attrs):
        """checks whether the class of the document is among forbiddenclasses
        returns OK, Forbidden, or Inside, for class which should be
        read, which is forbidden, or which is inside other class, which is
        forbidden."""
        recclass = [v for k, v in attrs if k=='class']
        if len(recclass)<1:
            recclass = ("",)
        if not(recclass[0] in self.forbiddenclasses) and \
              (self.unreadability == OK):
            return OK
        elif (recclass[0] in self.forbiddenclasses):
            return FORBIDDEN
        elif (self.unreadability > 0):
            return INSIDE
        else:
            raise ValueError('Unknown result when checking for forbidden class!')
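
    # Illustrative behaviour of test_class:
    #   class listed in forbiddenclasses            -> FORBIDDEN
    #   unlisted class while unreadability == OK    -> OK
    #   unlisted class while unreadability > 0      -> INSIDE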

    def unknown_starttag(self, tag, attrs):
        if self.unreadability == OK:
            strattrs = "".join([' %s="%s"' % (key, value) \
                for key, value in attrs])
            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    def unknown_endtag(self, tag):
        if self.unreadability == OK:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_data(self, text):
        if self.unreadability == OK:
            self.pieces.append(text)

    def handle_charref(self, ref):
        if self.unreadability == OK:
            self.pieces.append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        if self.unreadability == OK:
            self.pieces.append("&%(ref)s" % locals())
            # standard HTML entities are closed with a semicolon;
            # other entities are not
            if ref in htmlentitydefs.entitydefs:
                self.pieces.append(";")

    def handle_comment(self, text):
        pass

    def handle_pi(self, text):
        """called for each processing instruction, e.g. <?instruction>
        Reconstruct original processing instruction."""
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        """called for the DOCTYPE, if present, e.g.
        <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
              "http://www.w3.org/TR/html4/loose.dtd">
        Reconstruct original DOCTYPE."""
        self.pieces.append("<!%(text)s>" % locals())

    def clean_starttag(self, tag, attrs):
        chkclass = self.test_class(attrs)
        if chkclass == OK:
            self.unknown_starttag(tag, attrs)
        else:
            # FORBIDDEN or INSIDE: descend one level deeper into junk
            self.stepUpUnreadability()

    def clean_endtag(self, tag):
        if self.unreadability == OK:
            self.unknown_endtag(tag)
        else:
            self.stepDownUnreadability()

    def omit_starttag(self, tag, attrs):
        self.stepUpUnreadability()

    def omit_endtag(self, tag):
        if self.unreadability != OK:
            self.stepDownUnreadability()

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)

    def start_font(self, attrs):
#         self.omit_starttag("font",attrs)
        pass

    def end_font(self):
#         self.omit_endtag("font")
        pass

    def start_script(self, attrs):
        self.omit_starttag("script",attrs)

    def end_script(self):
        self.omit_endtag("script")

    def start_style(self, attrs):
        self.omit_starttag("style",attrs)

    def end_style(self):
        self.omit_endtag("style")

    def start_noscript(self, attrs):
        """Like script and style, noscript content should be omitted
        entirely."""
        self.omit_starttag("noscript",attrs)

    def end_noscript(self):
        self.omit_endtag("noscript")

    def start_body(self, attrs):
        self.unknown_starttag("body",())

    def end_body(self):
        self.unknown_endtag("body")

    def start_td(self, attrs):
        pass

    def end_td(self):
        pass

    def start_center(self, attrs):
        pass

    def end_center(self):
        pass

    def start_tr(self, attrs):
        pass

    def end_tr(self):
        pass

    def start_table(self, attrs):
        pass

    def end_table(self):
        pass

    def start_div(self, attrs):
        self.unknown_starttag("div",attrs)

    def end_div(self):
        self.unknown_endtag("div",attrs)

    def start_h1(self,attrs):
        self.unknown_starttag("h1",attrs)

    def end_h1(self):
        self.unknown_endtag("h1")

    def start_h2(self,attrs):
        self.unknown_starttag("h2",attrs)

    def end_h2(self):
        self.unknown_endtag("h2")

    def start_p(self,attrs):
        self.unknown_starttag("p",attrs)

    def end_p(self):
        self.unknown_endtag("p")

    def start_span(self,attrs):
        pass

    def end_span(self):
        pass

    def do_link(self, attrs):
        pass

    def do_img(self, attrs):
        pass

    def do_hr(self,attrs):
        pass

    def do_br(self,attrs):
        self.pieces.append("<br>")

    def do_spacer(self,attrs):
        pass

    def do_meta(self,attrs):
         self.unknown_starttag("meta",attrs)


class Globe(SoupReader):
    """Specific class for parsing pages from the Boston Globe.
    URL looks like
    http://www.boston.com/news/local/massachusetts/articles/2005/06/08/robbery_stirs_fears_among_residents?rss_id=Boston+Globe+--+City%2FRegion+News
    """

    forbiddenclasses = ('advertisement', 'archivesField',
        'copyright', 'darkblueMini', 'darkgrayLine', 'footerLinks',
        'overline', 'padTop5', 'pageBreadcrumbPadding', 'parentBox',
        'railBox', 'relatedBoxHeadPadding', 'relatedBoxLine',
        'relatedBoxMainHeadline', 'secNavLinks', 'sponsoredLink',
        'sponsoredLinks', 'toolRailSingle', 'toolsMain', 'toolsRail',
        'c1', 'c3', 'c4', 'footerPF', 'small')

    def __init__(self,url,name):
        self.content = ""
        # strip the rss_id query string from the URL
        url = re.sub(r"(http://www\.boston\.com/[^?]*)\?rss_id.*", r"\1", url)
        SoupReader.__init__(self,url,name)

    def handle_data(self, text):
        if self.unreadability == OK:
            self.pieces.append(text)
        if self.content == MAINHEAD:
            self.mainhead = text
            self.content = None
        elif self.content == SUBHEAD:
            self.subhead = text
            self.content = None
        elif self.content == BYLINE:
            self.byline = text
            self.content = None
        elif self.content == DATE:
            self.date = text
            self.content = None

    def output(self):
        """Return processed HTML as a single string"""
        outstr = """
        <!--
        mainhead: %s
        subhead: %s
        byline: %s
        date: %s
        url: %s
        -->
        """ % (self.mainhead,
              self.subhead,
              self.byline,
              self.date,
              self.url)
        outstr += "".join(self.pieces)
        return outstr

    def start_div(self, attrs):
        self.clean_starttag("div",attr)

    def end_div(self):
        self.clean_endtag("div")

    def start_h1(self,attrs):
        recclass = [v for k, v in attrs if k=='class']
        if (len(recclass)>0) and (recclass[0] in ("mainHead")):
            self.content = MAINHEAD
        self.unknown_starttag("h1",attrs)

    def end_h1(self):
        self.unknown_endtag("h1")

    def start_h2(self,attrs):
        recclass = [v for k, v in attrs if k=='class']
        if (len(recclass)>0) and (recclass[0] in ("subHead")):
            self.content = SUBHEAD
        self.unknown_starttag("h2",attrs)

    def end_h2(self):
        self.unknown_endtag("h2")

    def start_p(self,attrs):
        recclass = [v for k, v in attrs if k=='class']
        if (len(recclass)>0) and (recclass[0] in ("byline")):
            self.content = BYLINE
        self.unknown_starttag("p",attrs)

    def end_p(self):
        self.unknown_endtag("p")

    def start_span(self,attrs):
        recclass = [v for k, v in attrs if k=='style']
        if (len(recclass)>0) and (recclass[0] in ("white-space: nowrap;")):
            self.content = DATE

    def start_td(self, attrs):
        """If class of td is among forbiddenclasses, then not only
        do not output class tags (which is default for <TD>
        element), but even stop outputing its content."""
        self.clean_starttag("td",attrs)

    def end_td(self):
        self.clean_endtag("td")

class Herald(SoupReader):
    """Specific class for parsing pages from the Boston Herald.
    URL looks like
    http://news.bostonherald.com/localRegional/view.bg?articleid=89008 so we can
    leave it as it is.
    """

    forbiddenclasses = ("buttonFont","storyFont","breadFont")

    def __init__(self,url,name):
        SoupReader.__init__(self,url,name)

    def output(self):
        """Return processed HTML as a single string"""
        outstr = """
        <!--
        mainhead: %s
        date: %s
        url: %s
        -->
        """ % (self.mainhead,
              self.date,
              self.url)
        outstr += "".join(self.pieces)
        return outstr

    def handle_data(self, text):
        if self.unreadability == OK:
            self.pieces.append(text)

    def do_meta(self,attrs):
        recclass = [v for k, v in attrs if k=='name']
        if (len(recclass)>0) and (recclass[0] == "PUBDATE"):
            self.date = [v for k,v in attrs if k=='content'][0]
        elif (len(recclass)>0) and (recclass[0] == "HEADLINE"):
            self.mainhead = [v for k,v in attrs if k=='content'][0]
        self.unknown_starttag("meta",attrs)

    def start_font(self,attrs):
         self.clean_starttag("font",attrs)

    def end_font(self):
        self.clean_endtag("font")

    def start_form(self,attrs):
        self.omit_starttag("form",attrs)

    def end_form(self):
        self.omit_endtag("form")

class OtherNewspage(SoupReader):
    """Specific class for parsing pages from all other news
    webpages."""

    forbiddenclasses = ()

    def __init__(self,url,name):
        SoupReader.__init__(self,url,name)

    def output(self):
        return "".join(self.pieces)

    def handle_data(self, text):
        if self.unreadability == OK:
            self.pieces.append(text)

    def start_font(self,attrs):
         self.clean_starttag("font",attrs)

    def end_font(self):
        self.clean_endtag("font")

    def start_form(self,attrs):
        self.omit_starttag("form",attrs)

    def end_form(self):
        self.omit_endtag("form")

if __name__ == "__main__":
    NewsPage = Herald("herald.html","herald-out.html")