commit: 1029f9c624e3f3bf252f20197f357cca00a20410
Author: Göktürk Yüksek <gokturk <AT> gentoo <DOT> org>
AuthorDate: Thu Dec 26 01:37:23 2019 +0000
Commit: Göktürk Yüksek <gokturk <AT> gentoo <DOT> org>
CommitDate: Thu Dec 26 01:37:23 2019 +0000
URL: https://gitweb.gentoo.org/proj/devmanual.git/commit/?id=1029f9c6
bin/build_search_documents.py: handle multi-line indented text better
In addition to replacing newlines that show up in the middle of text,
also remove the whitespace following each newline (which is the
indentation).
Signed-off-by: Göktürk Yüksek <gokturk <AT> gentoo.org>
bin/build_search_documents.py | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py
index 1aac495..38ffd24 100755
--- a/bin/build_search_documents.py
+++ b/bin/build_search_documents.py
@@ -5,6 +5,12 @@ import json
import os.path
import sys
import xml.etree.ElementTree as ET
+import re
+
+
+# The regex for stripping a newline and the possible indentation
+# whitespace following it in multiline content
+whitespace_re = re.compile(r'\n[ \t]*', flags=re.M)
def stringify_node(parent: ET.Element) -> str:
@@ -28,7 +34,7 @@ def stringify_node(parent: ET.Element) -> str:
# For each child, strip the tags and append to text
# along with the tail text following it.
- # The tail may include '\n' if it spans multiple lines.
+ # The tail may include '\n', '\t', ' ' if it spans multiple lines.
# We will worry about those on return, not now.
for child in parent:
# The '<d/>' tag is simply a fancier '-' character
@@ -42,8 +48,8 @@ def stringify_node(parent: ET.Element) -> str:
# A paragraph typically ends with:
# Text\n</p>
# Right strip any spurious whitespace.
- # Finally, get rid of any intermediate newlines.
- return text.rstrip().replace('\n', ' ')
+ # Finally, get rid of any intermediate newlines and indentation whitespace.
+ return whitespace_re.sub(' ', text.rstrip())
def process_node(documents: list, node: ET.Element, name: str, url: str) ->
None: