This is an automated email from the ASF dual-hosted git repository. alexott pushed a commit to branch branch-0.9 in repository https://gitbox.apache.org/repos/asf/zeppelin.git
The following commit(s) were added to refs/heads/branch-0.9 by this push: new d38778d [ZEPPELIN-4870] Improve parsing of the paragraph properties d38778d is described below commit d38778df4079c9945ad88d03d2acfc9748832fb1 Author: Alex Ott <alex...@gmail.com> AuthorDate: Thu Jun 11 15:09:38 2020 +0200 [ZEPPELIN-4870] Improve parsing of the paragraph properties ### What is this PR for? We can provide properties that are local to the paragraph, that could be used to pass an additional information for interpreter that could affect its behavior. Unfortunately existing parsing functionality relies on the fact that key/value pairs need to be separated by `,` character, and doesn't handle values with special characters (`,`, `=`, ...) inside, like this: ``` %cassandra(locale=ruRU, timeFormat="E, d MMM yy", floatPrecision = 5, outputFormat=cql) ``` This PR changes the parsing logic to perform character-by-character parsing, and handling of the quoted keys & values, escaping of the special characters, etc. ### What type of PR is it? Improvement ### What is the Jira issue? * https://issues.apache.org/jira/browse/ZEPPELIN-4870 ### How should this be tested? * https://travis-ci.org/github/alexott/zeppelin/builds/697522260 * additional unit tests were added Author: Alex Ott <alex...@gmail.com> Closes #3799 from alexott/ZEPPELIN-4870 and squashes the following commits: 5fb6ee84e [Alex Ott] [ZEPPELIN-4870] Improve parsing of the paragraph properties (cherry picked from commit 1d1b05839a4b6456492e5fe4dd6682601bf1d402) Signed-off-by: Alex Ott <alex...@apache.org> --- docs/usage/interpreter/overview.md | 8 +- .../zeppelin/notebook/ParagraphTextParser.java | 129 ++++++++++++++++----- .../zeppelin/notebook/ParagraphTextParserTest.java | 105 +++++++++++++++-- 3 files changed, 205 insertions(+), 37 deletions(-) diff --git a/docs/usage/interpreter/overview.md b/docs/usage/interpreter/overview.md index 865c3a6..20989ee 100644 --- a/docs/usage/interpreter/overview.md +++ b/docs/usage/interpreter/overview.md @@ -38,7 +38,13 @@ When you click the ```+Create``` button on the interpreter page, the interpreter You can create multiple interpreters for the same engine with different interpreter setting. e.g. You can create `spark2` for Spark 2.x and create `spark1` for Spark 1.x. -For each paragraph you write in Zeppelin, you need to specify its interpreter first via `%interpreter_group.interpreter_name`. e.g. `%spark.pyspark`, `%spark.r` +For each paragraph you write in Zeppelin, you need to specify its interpreter first via `%interpreter_group.interpreter_name`. e.g. `%spark.pyspark`, `%spark.r`. + +If you specify interpreter, you can also pass local properties to it (if it needs them). This is done by providing a set of key/value pairs, separated by comma, inside the round brackets right after the interpreter name. If key or value contain characters like `=`, or `,`, then you can either escape them with `\` character, or wrap the whole value inside the double quotes For example: + +``` +%cassandra(outputFormat=cql, dateFormat="E, d MMM yy", timeFormat=E\, d MMM yy) +``` ## What are the Interpreter Settings? The interpreter settings are the configuration of a given interpreter on the Zeppelin server. For example, certain properties need to be set for the Apache Hive JDBC interpreter to connect to the Hive server. diff --git a/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java b/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java index 5ee7261..f4742b4 100644 --- a/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java +++ b/zeppelin-zengine/src/main/java/org/apache/zeppelin/notebook/ParagraphTextParser.java @@ -17,13 +17,12 @@ package org.apache.zeppelin.notebook; -import org.apache.commons.lang3.StringUtils; - import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Parser which is used for parsing the paragraph text. @@ -42,6 +41,7 @@ import java.util.regex.Pattern; * localProperties: Map(pool->pool_1) */ public class ParagraphTextParser { + private static final Logger LOGGER = LoggerFactory.getLogger(ParagraphTextParser.class); public static class ParseResult { private String intpText; @@ -67,41 +67,112 @@ public class ParagraphTextParser { } } - private static Pattern REPL_PATTERN = - Pattern.compile("(\\s*)%([\\w\\.]+)(\\(.*?\\))?.*", Pattern.DOTALL); + private static Pattern REPL_PATTERN = Pattern.compile("^(\\s*)%(\\w+(?:\\.\\w+)*)"); + + private static int parseLocalProperties( + final String text, int startPos, + Map<String, String> localProperties) throws RuntimeException{ + startPos++; + String propKey = null; + boolean insideQuotes = false, parseKey = true, finished = false; + StringBuilder sb = new StringBuilder(); + while(!finished && startPos < text.length()) { + char ch = text.charAt(startPos); + switch (ch) { + case ')': { + if (!insideQuotes) { + if (parseKey) { + propKey = sb.toString().trim(); + if (!propKey.isEmpty()) { + localProperties.put(propKey, propKey); + } + } else { + localProperties.put(propKey, sb.toString().trim()); + } + finished = true; + } else { + sb.append(ch); + } + break; + } + case '\\': { + if ((startPos + 1) == text.length()) { + throw new RuntimeException( + "Problems by parsing paragraph. Unfinished escape sequence"); + } + startPos++; + ch = text.charAt(startPos); + switch (ch) { + case 'n': + sb.append('\n'); + break; + case 't': + sb.append('\t'); + break; + default: + sb.append(ch); + } + break; + } + case '"': { + insideQuotes = !insideQuotes; + break; + } + case '=': { + if (insideQuotes) { + sb.append(ch); + } else { + propKey = sb.toString().trim(); + sb.delete(0, sb.length()); + parseKey = false; + } + break; + } + case ',': { + if (insideQuotes) { + sb.append(ch); + } else if (propKey == null || propKey.trim().isEmpty()) { + throw new RuntimeException( + "Problems by parsing paragraph. Local property key is empty"); + } else { + if (parseKey) { + propKey = sb.toString().trim(); + localProperties.put(propKey, propKey); + } else { + localProperties.put(propKey, sb.toString().trim()); + } + } + propKey = null; + parseKey = true; + sb.delete(0, sb.length()); + break; + } + default: + sb.append(ch); + } + startPos++; + } + if (!finished) { + throw new RuntimeException( + "Problems by parsing paragraph. Not finished interpreter configuration"); + } + return startPos; + } public static ParseResult parse(String text) { Map<String, String> localProperties = new HashMap<>(); - String intpText = null; + String intpText = ""; String scriptText = null; Matcher matcher = REPL_PATTERN.matcher(text); - if (matcher.matches()) { + if (matcher.find()) { String headingSpace = matcher.group(1); intpText = matcher.group(2); - if (matcher.groupCount() == 3 && matcher.group(3) != null) { - String localPropertiesText = matcher.group(3); - String[] splits = localPropertiesText.substring(1, localPropertiesText.length() - 1) - .split(","); - for (String split : splits) { - String[] kv = split.split("="); - if (StringUtils.isBlank(split) || kv.length == 0) { - continue; - } - if (kv.length > 2) { - throw new RuntimeException("Invalid paragraph properties format: " + split); - } - if (kv.length == 1) { - localProperties.put(kv[0].trim(), kv[0].trim()); - } else { - localProperties.put(kv[0].trim(), kv[1].trim()); - } - } - scriptText = text.substring(headingSpace.length() + intpText.length() + - localPropertiesText.length() + 1).trim(); - } else { - scriptText = text.substring(headingSpace.length() + intpText.length() + 1).trim(); + int startPos = headingSpace.length() + intpText.length() + 1; + if (text.charAt(startPos) == '(') { + startPos = parseLocalProperties(text, startPos, localProperties); } + scriptText = text.substring(startPos).trim(); } else { intpText = ""; scriptText = text; diff --git a/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java b/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java index 6da2454..4636105 100644 --- a/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java +++ b/zeppelin-zengine/src/test/java/org/apache/zeppelin/notebook/ParagraphTextParserTest.java @@ -17,7 +17,9 @@ package org.apache.zeppelin.notebook; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import static junit.framework.TestCase.assertEquals; @@ -33,30 +35,119 @@ public class ParagraphTextParserTest { } @Test - public void testParagraphText() { + public void testParagraphTextLocalPropertiesAndText() { ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark(pool=pool_1) sc.version"); assertEquals("spark.pyspark", parseResult.getIntpText()); assertEquals(1, parseResult.getLocalProperties().size()); assertEquals("pool_1", parseResult.getLocalProperties().get("pool")); assertEquals("sc.version", parseResult.getScriptText()); + } - // no script text - parseResult = ParagraphTextParser.parse("%spark.pyspark(pool=pool_1)"); + @Test + public void testParagraphTextLocalPropertiesNoText() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark(pool=pool_1)"); assertEquals("spark.pyspark", parseResult.getIntpText()); assertEquals(1, parseResult.getLocalProperties().size()); assertEquals("pool_1", parseResult.getLocalProperties().get("pool")); assertEquals("", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextLocalPropertyNoValueNoText() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark(pool)"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(1, parseResult.getLocalProperties().size()); + assertEquals("pool", parseResult.getLocalProperties().get("pool")); + assertEquals("", parseResult.getScriptText()); + } - // no paragraph local properties - parseResult = ParagraphTextParser.parse("%spark.pyspark sc.version"); + @Test + public void testParagraphTextNoLocalProperties() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark.pyspark sc.version"); assertEquals("spark.pyspark", parseResult.getIntpText()); assertEquals(0, parseResult.getLocalProperties().size()); assertEquals("sc.version", parseResult.getScriptText()); + } - // no intp text and paragraph local properties - parseResult = ParagraphTextParser.parse("sc.version"); + @Test + public void testParagraphNoInterpreter() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("sc.version"); assertEquals("", parseResult.getIntpText()); assertEquals(0, parseResult.getLocalProperties().size()); assertEquals("sc.version", parseResult.getScriptText()); } + + @Test + public void testParagraphInterpreterWithoutProperties() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse("%spark() sc.version"); + assertEquals("spark", parseResult.getIntpText()); + assertEquals(0, parseResult.getLocalProperties().size()); + assertEquals("sc.version", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextQuotedPropertyValue1() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse( + "%spark.pyspark(pool=\"value with = inside\")"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(1, parseResult.getLocalProperties().size()); + assertEquals("value with = inside", parseResult.getLocalProperties().get("pool")); + assertEquals("", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextQuotedPropertyValue2() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse( + "%spark.pyspark(pool=\"value with \\\" inside\", p=\"eol\\ninside\" )"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(2, parseResult.getLocalProperties().size()); + assertEquals("value with \" inside", parseResult.getLocalProperties().get("pool")); + assertEquals("eol\ninside", parseResult.getLocalProperties().get("p")); + assertEquals("", parseResult.getScriptText()); + } + + @Test + public void testParagraphTextQuotedPropertyKeyAndValue() { + ParagraphTextParser.ParseResult parseResult = ParagraphTextParser.parse( + "%spark.pyspark(\"po ol\"=\"value with \\\" inside\")"); + assertEquals("spark.pyspark", parseResult.getIntpText()); + assertEquals(1, parseResult.getLocalProperties().size()); + assertEquals("value with \" inside", parseResult.getLocalProperties().get("po ol")); + assertEquals("", parseResult.getScriptText()); + } + + @Rule + public ExpectedException exceptionRule = ExpectedException.none(); + + @Test + public void testParagraphTextUnfinishedConfig() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Not finished interpreter configuration"); + ParagraphTextParser.parse("%spark.pyspark(pool="); + } + + @Test + public void testParagraphTextUnfinishedQuote() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Not finished interpreter configuration"); + ParagraphTextParser.parse("%spark.pyspark(pool=\"2314234) sc.version"); + } + + @Test + public void testParagraphTextUnclosedBackslash() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Unfinished escape sequence"); + ParagraphTextParser.parse("%spark.pyspark(pool=\\"); + } + + @Test + public void testParagraphTextEmptyKey() { + exceptionRule.expect(RuntimeException.class); + exceptionRule.expectMessage( + "Problems by parsing paragraph. Local property key is empty"); + ParagraphTextParser.parse("%spark.pyspark(pool=123, ,)"); + } }