tags 862004 + moreinfo
thanks

Hello Manolo,

thank you for spending your time helping to make Debian better with
this bug report.

I have checked your issue and have a quick fix for it.

Could you please test it?

To test it, you must change your setup (add the Accept and
User-Agent lines):

{'shortname': 'WMO Library',
           'type': 'html',
           'uri': 
'https://library.wmo.int/opac/index.php?lvl=infopages&lang=en_UK&pagesid=1',
           'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) 
Gecko/20100101 Firefox/45.0',
           'contentxpath': '//*[@id="overview"]/tbody/tr[1]/td[1]'
           }


back up the original (`cp /usr/share/mwc/mwc.py /usr/share/mwc/mwc.py.org`),

copy the attached mwc.py to /usr/share/mwc

and test it.

Many thanks!

CU
Jörg

-- 
New:
GPG Fingerprint: 63E0 075F C8D4 3ABB 35AB  30EE 09F8 9F3C 8CA1 D25D
GPG key (long) : 09F89F3C8CA1D25D
GPG Key        : 8CA1D25D
CAcert Key S/N : 0E:D4:56

Old pgp Key: BE581B6E (revoked since 2014-12-31).

Jörg Frings-Fürst
D-54470 Lieser

Threema: SYR8SJXB
Wire: @joergfringsfuerst

IRC: j_...@freenode.net
     j_...@oftc.net

My wish list: 
 - Please send me a picture from the nature at your home.
#!/usr/bin/python3

# Copyright: (2013-2014) Michael Till Beck <debiang...@gmx.de>
# License: GPL-2.0+

import urllib.request, urllib.error, urllib.parse
import urllib.parse
from lxml import etree
from cssselect import GenericTranslator
import re
import io

import smtplib
from email.mime.text import MIMEText
from email.header import Header
from urllib.parse import urljoin

import os
import sys
import getopt
import traceback
import syslog

import subprocess

import time
from time import strftime
import random

import importlib
config = None

defaultEncoding = 'utf-8'
maxTitleLength = 150


# this is how an empty feed looks like
emptyfeed = """<?xml version="1.0"?>
<rss version="2.0">
 <channel>
  <title>MailWebsiteChanges Feed</title>
  <link>https://github.com/Debianguru/MailWebsiteChanges</link>
  <description>MailWebsiteChanges Feed</description>
 </channel>
</rss>"""

# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs.
uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
cmdscheme = 'cmd://'

mailsession = None


# translates all relative URIs found in trees to absolute URIs
def toAbsoluteURIs(trees, baseuri):
        for tree in trees:
                for uriAttribute in uriAttributes:
                        tags = tree.xpath(uriAttribute[0])
                        for tag in tags:
                                if tag.attrib.get(uriAttribute[1]) != None:
                                        if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
                                                tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])


def parseSite(site):
        file, content, titles, warning = None, None, None, None

        uri = site['uri']
        contenttype = site.get('type', 'html')
        contentregex = site.get('contentregex', '')
        titleregex = site.get('titleregex', '')
        UserAgent = site.get('User-agent', '')
        Accept = site.get('Accept', '')
        enc = site.get('encoding', defaultEncoding)

        contentxpath = site.get('contentxpath', '')
        if contentxpath == '' and site.get('contentcss', '') != '':
                # CSS
                contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
        titlexpath = site.get('titlexpath', '')
        if titlexpath == '' and site.get('titlecss', '') != '':
                titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))

        try:

                if uri.startswith(cmdscheme):
                        # run command and retrieve output
                        process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
                        file = process.stdout
                else:
                        # open website
                        req = urllib.request.Request(uri)
                        if UserAgent != '':
                           req.add_header('User-agent', UserAgent)
                        if Accept != '':
                           req.add_header('Accept', Accept)
                        file = urllib.request.urlopen(req)


                if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
                        contents = [file.read().decode(enc)]
                        titles = []
                else:
                        baseuri = uri
                        if contenttype == 'html':
                                parser = etree.HTMLParser(encoding=enc)
                        else:
                                parser = etree.XMLParser(recover=True, encoding=enc)

                        tree = etree.parse(file, parser)

                        # xpath
                        contentresult = tree.xpath(contentxpath) if contentxpath else []
                        titleresult = tree.xpath(titlexpath) if titlexpath else []

                        # translate relative URIs to absolute URIs
                        if contenttype == 'html':
                                basetaglist = tree.xpath('/html/head/base')
                                if len(basetaglist) != 0:
                                        baseuri = basetaglist[0].attrib['href']
                                if len(contentresult) != 0:
                                        toAbsoluteURIs(contentresult, baseuri)
                                if len(titleresult) != 0:
                                        toAbsoluteURIs(titleresult, baseuri)

                        if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
                                warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
                        elif contentxpath and len(contentresult) == 0:
                                warning = 'WARNING: content selector became invalid!'
                        elif titlexpath and len(titleresult) == 0:
                                warning = 'WARNING: title selector became invalid!'
                        else:
                                if len(contentresult) == 0:
                                        contentresult = titleresult
                                if len(titleresult) == 0:
                                        titleresult = contentresult

                        contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
                        titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]

        except IOError as e:
                warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)

        if file is not None:
                file.close()

        if uri.startswith(cmdscheme) and process.wait() != 0:
                warning = 'WARNING: process terminated with an error'

        if warning:
                return {'content': content, 'titles': titles, 'warning': warning}

        # parse regex
        if contentregex:
                contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y]
        if titleregex:
                titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y]

        if contentregex and titleregex and len(contents) != len(titles):
                warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
        elif contentregex and len(contents) == 0:
                warning = 'WARNING: content regex became invalid!'
        elif titleregex and len(titles) == 0:
                warning = 'WARNING: title regex became invalid!'
        else:
                if len(contents) == 0:
                        contents = titles
                if len(titles) == 0:
                        titles = [getSubject(c) for c in contents]

        return {'contents': contents, 'titles': titles, 'warning': warning}


# returns a short subject line
def getSubject(textContent):
        if textContent == None or textContent == '':
                return config.subjectPostfix
        textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
        return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent


# generates a new RSS feed item
def genFeedItem(subject, content, link, change):
        feeditem = etree.Element('item')
        titleitem = etree.Element('title')
        titleitem.text = subject + ' #' + str(change)
        feeditem.append(titleitem)
        linkitem = etree.Element('link')
        linkitem.text = link
        feeditem.append(linkitem)
        descriptionitem = etree.Element('description')
        descriptionitem.text = content
        feeditem.append(descriptionitem)
        guiditem = etree.Element('guid')
        guiditem.text = str(random.getrandbits(32))
        feeditem.append(guiditem)
        dateitem = etree.Element('pubDate')
        dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
        feeditem.append(dateitem)

        return feeditem


# sends mail notification
def sendmail(receiver, subject, content, sendAsHtml, link):
        global mailsession

        if sendAsHtml:
                baseurl = None
                if link != None:
                        content = '<p><a href="' + link + '">' + subject + '</a></p>\n' + content
                        baseurl = urljoin(link, '/')
                mail = MIMEText('<html><head><title>' + subject + '</title>' + ('<base href="' + baseurl + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', defaultEncoding)
        else:
                if link != None:
                        content = link + '\n\n' + content
                mail = MIMEText(content, 'text', defaultEncoding)

        mail['From'] = config.sender
        mail['To'] = receiver
        mail['Subject'] = Header(subject, defaultEncoding)

        # initialize session once, not each time this method gets called
        #
        # add try / except to open mailsession
	#
	
        try:
                if mailsession is None:
                        mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
                        if config.useTLS:
                                mailsession.ehlo()
                                mailsession.starttls()
                        mailsession.login(config.smtpusername, config.smtppwd)
        except:
                printf('Error: Open smtp-session')
                syslog.syslog(syslog.LOG_ERR, 'can not open smtp session')
                exit(4)
	#
        # add try / except to send mail
        #
        try:
                mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
        except:
                printf('Error: sendmail')
                syslog.syslog(syslog.LOG_ERR, 'error on sendmail')
                exit(5)

# returns a list of all content that is stored locally for a specific site
def getFileContents(shortname):
        result = []
        for f in os.listdir('.'):
                if f.startswith(shortname + '.') and f.endswith('.txt'):
                        file = open(f, 'r')
                        result.append(file.read())
                        file.close()
        return result


# updates list of content that is stored locally for a specific site
def storeFileContents(shortname, parseResult):
        for f in os.listdir('.'):
                if f.startswith(shortname + '.') and f.endswith('.txt'):
                        os.remove(f)

        i = 0
        for c in parseResult['contents']:
                file = open(shortname + '.' + str(i) + '.txt', 'w')
                file.write(c)
                file.close()
                i += 1


def pollWebsites():

        # parse existing feed or create a new one
        if config.enableRSSFeed:
                if os.path.isfile(config.rssfile):
                        feedXML = etree.parse(config.rssfile)
                else:
                        feedXML = etree.parse(io.StringIO(emptyfeed))

        # start polling sites
        for site in config.sites:

                print('polling site [' + site['shortname'] + '] ...')
                parseResult = parseSite(site)
                receiver = site.get('receiver', config.receiver)

                # if something went wrong, notify the user
                if parseResult['warning']:
                        subject = '[' + site['shortname'] + '] WARNING'
                        print('WARNING: ' + parseResult['warning'])
                        if config.enableMailNotifications:
                                sendmail(receiver, subject, parseResult['warning'], False, None)
                        if config.enableRSSFeed:
                                feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
                else:
                        # otherwise, check which parts of the site were updated
                        changes = 0
                        fileContents = getFileContents(site['shortname'])
                        i = 0
                        for content in parseResult['contents']:
                                if content not in fileContents:
                                        changes += 1

                                        subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
                                        print('    ' + subject)
                                        if config.enableMailNotifications and len(fileContents) > 0:
                                                sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])

                                        if config.enableRSSFeed:
                                                feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))
                                i += 1


                        if changes > 0:
                                storeFileContents(site['shortname'], parseResult)
                                print('        ' + str(changes) + ' updates')
 
        # store feed
        if config.enableRSSFeed:
                for o in feedXML.xpath('//channel/item[position()<last()-' + str(config.maxFeeds - 1) + ']'):
                        o.getparent().remove(o)
                file = open(config.rssfile, 'w')
                file.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding).decode(defaultEncoding))
                file.close()


if __name__ == "__main__":

        configMod = '/etc/mwc/mwc-config'
        dryrun = None
        
        #
        # add syslog open
        #
        syslog.openlog()
        try:
                opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run='])
        except getopt.GetoptError:
                print('Usage: mwc.py --config=config --dry-run=shortname')
                sys.exit(1)
        for opt, arg in opts:
                if opt == '-h':
                        print('Usage: mwc.py --config=config')
                        exit()
                elif opt in ('-c', '--config'):
                        configMod = arg
                elif opt in ('-d', '--dry-run'):
                        dryrun = arg
	# 
	# add code to load config from nonsystem path
	# and change to datadir
	#
        try:
            path = os.path.dirname(configMod)
            fullname = os.path.basename(configMod)
            sys.path.append(path)
            config = importlib.import_module(fullname)
        except: 
            print('Error: loading config')
            syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config')
            sys.exit(2)
        try:
            os.chdir(config.datadir)
        except: 
            print('Error: datadir not found')
            syslog.syslog(syslog.LOG_ERR, 'datadir not found')
            sys.exit(3)
    
        if dryrun:
                for site in config.sites:
                        if site['shortname'] == dryrun:
                                parseResult = parseSite(site)
                                print(parseResult)
                                break
        else:
                try:
                        pollWebsites()
                except:
                        msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc()
                        print(msg)
                        if config.receiver != '':
                                sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None)

                if mailsession:
                        mailsession.quit()
                        mailsession = None

        syslog.closelog()
	

Attachment: signature.asc
Description: This is a digitally signed message part

Reply via email to