tags 862004 + moreinfo thanks
Hello Manolo, thank you for taking the time to help make Debian better with this bug report. I have looked into your issue and have a quick fix for it. Could you please test it? To do so, change your setup by adding the Accept and User-agent lines (the key names are case-sensitive): {'shortname': 'WMO Library', 'type': 'html', 'uri': 'https://library.wmo.int/opac/index.php?lvl=infopages&lang=en_UK&pagesid=1', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 'contentxpath': '//*[@id="overview"]/tbody/tr[1]/td[1]' } Then back up the original with "cp /usr/share/mwc/mwc.py /usr/share/mwc/mwc.py.org", copy the attached mwc.py to /usr/share/mwc and test it. Many thanks! CU Jörg -- New: GPG Fingerprint: 63E0 075F C8D4 3ABB 35AB 30EE 09F8 9F3C 8CA1 D25D GPG key (long) : 09F89F3C8CA1D25D GPG Key : 8CA1D25D CAcert Key S/N : 0E:D4:56 Old pgp Key: BE581B6E (revoked since 2014-12-31). Jörg Frings-Fürst D-54470 Lieser Threema: SYR8SJXB Wire: @joergfringsfuerst IRC: j_...@freenode.net j_...@oftc.net My wish list: - Please send me a picture of the nature around your home.
#!/usr/bin/python3

# Copyright: (2013-2014) Michael Till Beck <debiang...@gmx.de>
# License: GPL-2.0+

import urllib.request
import urllib.error
import urllib.parse
from lxml import etree
from cssselect import GenericTranslator
import re
import io
import html
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from urllib.parse import urljoin
import os
import sys
import getopt
import traceback
import syslog
import subprocess
import time
from time import strftime
import random
import importlib

# configuration module; loaded in the __main__ section below
config = None

defaultEncoding = 'utf-8'
maxTitleLength = 150

# this is how an empty feed looks like
emptyfeed = """<?xml version="1.0"?>
<rss version="2.0">
 <channel>
  <title>MailWebsiteChanges Feed</title>
  <link>https://github.com/Debianguru/MailWebsiteChanges</link>
  <description>MailWebsiteChanges Feed</description>
 </channel>
</rss>"""

# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs.
uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
cmdscheme = 'cmd://'

# SMTP session shared by all sendmail() calls; opened lazily on first use
mailsession = None


# translates all relative URIs found in trees to absolute URIs
def toAbsoluteURIs(trees, baseuri):
    """Rewrite scheme-less URI attributes reachable from *trees* in place.

    trees   -- iterable of lxml elements (results of an XPath query)
    baseuri -- URI that relative references are resolved against
    """
    # NOTE: the selectors in uriAttributes start with '//', i.e. they are
    # evaluated from the document root of each tree.
    for tree in trees:
        for selector, attribute in uriAttributes:
            for tag in tree.xpath(selector):
                value = tag.attrib.get(attribute)
                if value is not None and urllib.parse.urlparse(value).scheme == '':
                    tag.attrib[attribute] = urllib.parse.urljoin(baseuri, value)


def parseSite(site):
    """Fetch one configured site and extract its content and title blocks.

    site -- dict from the configuration; reads 'uri' (mandatory) and the
            optional keys 'type', 'contentregex', 'titleregex',
            'User-Agent' / 'User-agent', 'Accept', 'encoding',
            'contentxpath', 'contentcss', 'titlexpath', 'titlecss'.

    Returns a dict with the keys 'contents', 'titles' and 'warning'.
    'warning' is None on success; otherwise it holds a message and
    'contents' / 'titles' may be None.
    """
    file, process, contents, titles, warning = None, None, None, None, None

    uri = site['uri']
    contenttype = site.get('type', 'html')
    contentregex = site.get('contentregex', '')
    titleregex = site.get('titleregex', '')
    # accept the canonical header spelling as well as the historic key name
    UserAgent = site.get('User-Agent', site.get('User-agent', ''))
    Accept = site.get('Accept', '')
    enc = site.get('encoding', defaultEncoding)

    contentxpath = site.get('contentxpath', '')
    if contentxpath == '' and site.get('contentcss', '') != '':
        # CSS
        contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
    titlexpath = site.get('titlexpath', '')
    if titlexpath == '' and site.get('titlecss', '') != '':
        titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))

    try:
        if uri.startswith(cmdscheme):
            # run command and retrieve output
            # (the command comes from the local configuration file and is
            # deliberately passed to the shell)
            process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
            file = process.stdout
        else:
            # open website
            req = urllib.request.Request(uri)
            if UserAgent != '':
                req.add_header('User-agent', UserAgent)
            if Accept != '':
                req.add_header('Accept', Accept)
            file = urllib.request.urlopen(req)

        if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
            contents = [file.read().decode(enc)]
            titles = []
        else:
            baseuri = uri
            if contenttype == 'html':
                parser = etree.HTMLParser(encoding=enc)
            else:
                parser = etree.XMLParser(recover=True, encoding=enc)

            tree = etree.parse(file, parser)

            # xpath
            contentresult = tree.xpath(contentxpath) if contentxpath else []
            titleresult = tree.xpath(titlexpath) if titlexpath else []

            # translate relative URIs to absolute URIs
            if contenttype == 'html':
                basetaglist = tree.xpath('/html/head/base')
                if basetaglist:
                    # <base> may carry only a target attribute; fall back to the site URI
                    basehref = basetaglist[0].attrib.get('href')
                    if basehref:
                        baseuri = urljoin(uri, basehref)
                if contentresult:
                    toAbsoluteURIs(contentresult, baseuri)
                if titleresult:
                    toAbsoluteURIs(titleresult, baseuri)

            if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
                warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
            elif contentxpath and len(contentresult) == 0:
                warning = 'WARNING: content selector became invalid!'
            elif titlexpath and len(titleresult) == 0:
                warning = 'WARNING: title selector became invalid!'
            else:
                # only one selector given: use its result for both content and title
                if len(contentresult) == 0:
                    contentresult = titleresult
                if len(titleresult) == 0:
                    titleresult = contentresult
                contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
                titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]

    except IOError as e:
        warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)

    if file is not None:
        file.close()

    # process stays None when the command could not be started at all
    if process is not None and process.wait() != 0:
        warning = 'WARNING: process terminated with an error'

    if warning:
        return {'contents': contents, 'titles': titles, 'warning': warning}

    # parse regex
    if contentregex:
        contents = [match for c in contents for match in re.findall(contentregex, c, re.S)]
    if titleregex:
        titles = [match for c in titles for match in re.findall(titleregex, c, re.S)]

    if contentregex and titleregex and len(contents) != len(titles):
        warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
    elif contentregex and len(contents) == 0:
        warning = 'WARNING: content regex became invalid!'
    elif titleregex and len(titles) == 0:
        warning = 'WARNING: title regex became invalid!'
    else:
        if len(contents) == 0:
            contents = titles
        if len(titles) == 0:
            titles = [getSubject(c) for c in contents]

    return {'contents': contents, 'titles': titles, 'warning': warning}


# returns a short subject line
def getSubject(textContent):
    """Collapse whitespace in *textContent* and cut it to maxTitleLength.

    Returns config.subjectPostfix when the text is None or empty.
    """
    if textContent is None or textContent == '':
        return config.subjectPostfix
    textContent = re.sub(' +', ' ', re.sub(r'\s', ' ', textContent)).strip()
    return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent


# generates a new RSS feed item
def genFeedItem(subject, content, link, change):
    """Build an RSS <item> element.

    subject -- title text; ' #<change>' is appended to it
    content -- text of the <description> element
    link    -- text of the <link> element
    change  -- running number of the change within the current poll
    """
    feeditem = etree.Element('item')

    titleitem = etree.Element('title')
    titleitem.text = subject + ' #' + str(change)
    feeditem.append(titleitem)

    linkitem = etree.Element('link')
    linkitem.text = link
    feeditem.append(linkitem)

    descriptionitem = etree.Element('description')
    descriptionitem.text = content
    feeditem.append(descriptionitem)

    guiditem = etree.Element('guid')
    guiditem.text = str(random.getrandbits(32))
    feeditem.append(guiditem)

    dateitem = etree.Element('pubDate')
    dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
    feeditem.append(dateitem)

    return feeditem


# sends mail notification
def sendmail(receiver, subject, content, sendAsHtml, link):
    """Send one notification mail through the shared SMTP session.

    receiver   -- comma separated list of recipient addresses
    subject    -- subject line (also used as HTML title / link text)
    content    -- mail body; inserted verbatim as markup when sendAsHtml is set
    sendAsHtml -- send a text/html mail instead of text/plain
    link       -- URI of the changed site, or None

    Terminates the program with exit status 4 if the SMTP session cannot
    be opened and with status 5 if sending fails.
    """
    global mailsession

    if sendAsHtml:
        baseurl = None
        # subject and link are plain text; escape them before embedding in markup
        safesubject = html.escape(subject)
        if link is not None:
            content = '<p><a href="' + html.escape(link) + '">' + safesubject + '</a></p>\n' + content
            baseurl = urljoin(link, '/')
        mail = MIMEText('<html><head><title>' + safesubject + '</title>' + ('<base href="' + html.escape(baseurl) + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', defaultEncoding)
    else:
        if link is not None:
            content = link + '\n\n' + content
        # the MIME subtype for text mails is 'plain' ('text' would yield text/text)
        mail = MIMEText(content, 'plain', defaultEncoding)

    mail['From'] = config.sender
    mail['To'] = receiver
    mail['Subject'] = Header(subject, defaultEncoding)

    # initialize session once, not each time this method gets called
    try:
        if mailsession is None:
            mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
            if config.useTLS:
                mailsession.ehlo()
                mailsession.starttls()
            mailsession.login(config.smtpusername, config.smtppwd)
    except Exception as e:
        print('Error: Open smtp-session')
        syslog.syslog(syslog.LOG_ERR, 'can not open smtp session: ' + str(e))
        sys.exit(4)

    try:
        mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
    except Exception as e:
        print('Error: sendmail')
        syslog.syslog(syslog.LOG_ERR, 'error on sendmail: ' + str(e))
        sys.exit(5)


# returns a list of all content that is stored locally for a specific site
def getFileContents(shortname):
    """Read every '<shortname>.*.txt' file in the current directory."""
    result = []
    for f in os.listdir('.'):
        if f.startswith(shortname + '.') and f.endswith('.txt'):
            # explicit encoding: do not depend on the locale of the calling environment
            with open(f, 'r', encoding=defaultEncoding) as file:
                result.append(file.read())
    return result


# updates list of content that is stored locally for a specific site
def storeFileContents(shortname, parseResult):
    """Replace the stored '<shortname>.<n>.txt' files with parseResult['contents']."""
    for f in os.listdir('.'):
        if f.startswith(shortname + '.') and f.endswith('.txt'):
            os.remove(f)

    for i, c in enumerate(parseResult['contents']):
        with open(shortname + '.' + str(i) + '.txt', 'w', encoding=defaultEncoding) as file:
            file.write(c)


def pollWebsites():
    """Poll all sites in config.sites; mail and/or feed every detected change."""
    # parse existing feed or create a new one
    if config.enableRSSFeed:
        if os.path.isfile(config.rssfile):
            feedXML = etree.parse(config.rssfile)
        else:
            feedXML = etree.parse(io.StringIO(emptyfeed))

    # start polling sites
    for site in config.sites:
        print('polling site [' + site['shortname'] + '] ...')
        parseResult = parseSite(site)
        receiver = site.get('receiver', config.receiver)

        # if something went wrong, notify the user
        if parseResult['warning']:
            subject = '[' + site['shortname'] + '] WARNING'
            print('WARNING: ' + parseResult['warning'])
            if config.enableMailNotifications:
                sendmail(receiver, subject, parseResult['warning'], False, None)
            if config.enableRSSFeed:
                feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
        else:
            # otherwise, check which parts of the site were updated
            changes = 0
            fileContents = getFileContents(site['shortname'])
            for i, content in enumerate(parseResult['contents']):
                if content not in fileContents:
                    changes += 1

                    subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
                    print('    ' + subject)
                    # no mail on the very first poll of a site (nothing stored yet)
                    if config.enableMailNotifications and len(fileContents) > 0:
                        sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])

                    if config.enableRSSFeed:
                        feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))

            if changes > 0:
                storeFileContents(site['shortname'], parseResult)
                print('        ' + str(changes) + ' updates')

    # store feed
    if config.enableRSSFeed:
        # drop the oldest items so that at most config.maxFeeds remain
        for o in feedXML.xpath('//channel/item[position()<last()-' + str(config.maxFeeds - 1) + ']'):
            o.getparent().remove(o)
        # write the serialized bytes directly so the file really is encoded
        # as announced in its XML declaration
        with open(config.rssfile, 'wb') as file:
            file.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding))


if __name__ == "__main__":
    configMod = '/etc/mwc/mwc-config'
    dryrun = None

    syslog.openlog()

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run='])
    except getopt.GetoptError:
        print('Usage: mwc.py --config=config --dry-run=shortname')
        sys.exit(1)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('Usage: mwc.py --config=config')
            sys.exit()
        elif opt in ('-c', '--config'):
            configMod = arg
        elif opt in ('-d', '--dry-run'):
            dryrun = arg

    # load config from a non-system path and change to datadir
    try:
        path = os.path.dirname(configMod)
        fullname = os.path.basename(configMod)
        sys.path.append(path)
        config = importlib.import_module(fullname)
    except Exception as e:
        print('Error: loading config')
        syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config: ' + str(e))
        sys.exit(2)

    try:
        os.chdir(config.datadir)
    except Exception as e:
        print('Error: datadir not found')
        syslog.syslog(syslog.LOG_ERR, 'datadir not found: ' + str(e))
        sys.exit(3)

    if dryrun:
        for site in config.sites:
            if site['shortname'] == dryrun:
                parseResult = parseSite(site)
                print(parseResult)
                break
        else:
            print('Error: no site with shortname ' + dryrun)
    else:
        try:
            pollWebsites()
        except Exception:
            # 'except Exception' lets the SystemExit raised by sendmail()
            # pass through instead of trying to mail the failure again
            msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc()
            print(msg)
            if config.receiver != '':
                sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None)

        if mailsession:
            mailsession.quit()
            mailsession = None

    syslog.closelog()
signature.asc
Description: This is a digitally signed message part