commit:     926e0d0855afa40f5dcbc16b1b7c66187afd7d73
Author:     Göktürk Yüksek <gokturk <AT> gentoo <DOT> org>
AuthorDate: Tue Dec 10 02:08:12 2019 +0000
Commit:     Göktürk Yüksek <gokturk <AT> gentoo <DOT> org>
CommitDate: Thu Dec 19 20:58:02 2019 +0000
URL:        https://gitweb.gentoo.org/proj/devmanual.git/commit/?id=926e0d08

Rewrite the search functionality and extend the coverage

The current script only indexes the first <p> in a text.xml, and
sometimes only partially if the text is interrupted by another tag
such as <c/>.

Modify build_search_documents.py such that:
  - It recursively traverses from chapter all the way down to
    subsubsection
  - Each <p>, <important>, <note>, <warning> is indexed separately
  - In the search results, the match entry will have the title in
    the form "Chapter[ -> Section[ -> Subsection[ -> Subsubsection]]]"

Modify search.js such that:
  - The ref returned for a match is its index into "documents" array,
    which makes it possible to retrieve the document in O(1).

Signed-off-by: Göktürk Yüksek <gokturk <AT> gentoo.org>

 bin/build_search_documents.py | 112 ++++++++++++++++++++++++++++++++++++------
 search.js                     |  22 ++++-----
 2 files changed, 108 insertions(+), 26 deletions(-)

diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py index 9af2753..3816fdb 100755 --- a/bin/build_search_documents.py +++ b/bin/build_search_documents.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Copyright 2019 Gentoo Authors # Distributed under the terms of the GNU GPL version 2 or later import json @@ -6,19 +6,103 @@ import os.path import sys import xml.etree.ElementTree as ET -files = sys.argv[1:] -documents = [] -url_root = 'https://devmanual.gentoo.org/' -for f in files: - tree = ET.parse(f) - root = tree.getroot() - for chapter in root.findall('chapter'): +def stringify_node(parent: ET.Element) -> str: + """Flatten this node and its immediate children to a string. + + Combine the text and tail of this node, and any of its immediate + children, if there are any, into a flat string. The tag <d/> is a + special case that resolves to the dash ('-') character. 
+ + Keyword arguments: + parent -- the node to convert to a string + + """ + if parent.text: + text = parent.text.lstrip() + else: + text = str() + + for child in parent.getchildren(): + # The '<d/>' tag is simply a fancier '-' character + if child.tag == 'd': + text += '-' + if child.text: + text += child.text.lstrip() + if child.tail: + text += child.tail.rstrip() + + text += parent.tail.rstrip() + return text.replace('\n', ' ') + + +def process_node(documents: list, node: ET.Element, name: str, url: str) -> None: + """Recursively process a given node and its children based on tag values. + + For the top level node <chapter>, extract the title and recurse + down to the children. + For the intermediary nodes with titles, such as <section>, update + the search result title and url, and recurse down. + For the terminal nodes, such as <p>, convert the contents of the + node to a string, and add it to the search documents. + + Keyword arguments: + documents -- the search documents array + node -- the node to process + name -- the title to display for the search term match + url -- the url for the search term match in the document + + """ + if node.tag == 'chapter': + name = stringify_node(node.find('title')) + + for child in node: + process_node(documents, child, name, url) + elif node.tag in ['section', 'subsection', 'subsubsection']: + title = stringify_node(node.find('title')) + name += ' -> ' + title + url = "{url_base}#{anchor}".format( + url_base=url.split('#')[0], + anchor=title.lower().replace(' ', '-')) + + for child in node: + process_node(documents, child, name, url) + elif node.tag in ['body', 'guide']: + for child in node: + process_node(documents, child, name, url) + elif node.tag in ['p', 'important', 'note', 'warning']: + text = stringify_node(node) + + documents.append({'id': len(documents), + 'name': name, + 'text': text, + 'url': url}) + else: + pass + + +def main(pathnames: list) -> None: + """The entry point of the script. 
+ + Keyword arguments: + pathnames -- a list of path names to process in sequential order + """ + url_root = 'https://devmanual.gentoo.org/' + documents = [] + + for path in pathnames: + tree = ET.parse(path) + root = tree.getroot() + try: - documents.append({"name": chapter.find('title').text, - "text": chapter.find('body').find('p').text, - "url": url_root + os.path.dirname(f) + '/'}) - except AttributeError: - pass + url = url_root + os.path.dirname(path) + '/' + + process_node(documents, root, None, url) + except: + raise + + print('var documents = ' + json.dumps(documents) + ';') + -print('var documents = ' + json.dumps(documents) + ';') +if __name__ in '__main__': + main(sys.argv[1:]) diff --git a/search.js b/search.js index 0b9292f..ab28f87 100644 --- a/search.js +++ b/search.js @@ -5,9 +5,9 @@ "use strict"; var search_index = lunr(function () { - this.ref('name'); + this.ref('id'); this.field('text'); - this.field('url'); + this.metadataWhitelist = ['position'] documents.forEach(function (doc) { this.add(doc); @@ -23,15 +23,13 @@ search_input.addEventListener("keyup", function(event) { } }); -function getContents(docs, article) { - var contents = { text: "", url: "" }; +function getContents(docs, uid) { + var contents = { name: "", text: "", url: "" }; + + contents.name = docs[uid].name; + contents.text = docs[uid].text; + contents.url = docs[uid].url; - for (var i = 0; i< docs.length; i++) { - if (docs[i].name == article) { - contents.text = docs[i].text; - contents.url = docs[i].url; - } - } return contents; } @@ -42,8 +40,8 @@ function search() { if (results.length > 0) { $("#searchResults .modal-body").empty(); $.each(results, function(index, result) { - var title = result.ref; - var contents = getContents(documents, title); + var uid = result.ref; + var contents = getContents(documents, uid); $("#searchResults .modal-body").append(`<article><h5><a href="${contents.url}"> ${title}</a></h5><p>${contents.text}</p></article>`);
