kbobyrev created this revision.
kbobyrev added reviewers: ilya-biryukov, ioeric.
Herald added a reviewer: javed.absar.
Herald added a subscriber: kristof.beyls.
This patch significantly improves performance of the YAML serializer by
optimizing `YAML::isNumeric` function. This function is called on the most
strings and is highly inefficient for two reasons:
- It uses `Regex`, which is parsed and compiled each time this function is
called
- It uses multiple passes which are not necessary
This patch introduces stateful ad hoc YAML number parser which does not rely on
`Regex`. It also fixes YAML number format inconsistency: current implementation
supports C-stile octal number format (`01234567`) which was present in YAML 1.0
specialization (http://yaml.org/spec/1.0/), [Section 2.4. Tags, Example 2.19]
but was deprecated and is no longer present in latest YAML 1.2 specification
(http://yaml.org/spec/1.2/spec.html), see [Section 10.3.2. Tag Resolution].
Since the rest of the rest of the implementation does not support other
deprecated YAML 1.0 numeric features such as sexagecimal numbers, commas as
delimiters it is treated as inconsistency and not longer supported.
This performance bottleneck was identified while profiling Clangd's
global-symbol-builder tool with my colleague @ilya-biryukov. The substantial
part of the runtime was spent during a single-thread Reduce phase, which
concludes with YAML serialization of collected symbol collection. Regex
matching was accountable for approximately 45% of the whole runtime (which
involves sharded Map phase), now it is reduced to 18% (which is spent in
`clang::clangd::CanonicalIncludes` and can be also optimized because all used
regexes are in fact either suffix matches or exact matches).
Benchmarking `global-symbol-builder` (using `hyperfine --warmup 2 --min-runs 5
'command 1' 'command 2'` tool by processing a reasonable amount of code (26
source files matched by `clang-tools-extra/clangd/*.cpp` with all transitive
includes) confirmed our understanding of the performance bottleneck nature as
it speeds up the command by the factor of 1.6x:
| Command | Mean [s] | Min…Max [s] |
| :--- | ---: | ---: |
| patch | 84.7 ± 0.6 | 83.3…84.7 |
| master (https://reviews.llvm.org/rL339849) | 133.1 ± 0.8 | 132.4…134.6 |
|
Using smaller samples (e.g. by collecting symbols from
`clang-tools-extra/clangd/AST.cpp` only) yields even better performance
improvement, which is expected because Map phase takes less time compared to
Reduce and is 2.05x faster:
| Command | Mean [ms] | Min…Max [ms] |
| :--- | ---: | ---: |
| patch | 7607.6 ± 109.5 | 7533.3…7796.4 |
| master (https://reviews.llvm.org/rL339849) | 3702.2 ± 48.7 | 3635.1…3752.3 |
https://reviews.llvm.org/D50839
Files:
llvm/include/llvm/Support/YAMLTraits.h
llvm/unittests/Support/YAMLIOTest.cpp
Index: llvm/unittests/Support/YAMLIOTest.cpp
===================================================================
--- llvm/unittests/Support/YAMLIOTest.cpp
+++ llvm/unittests/Support/YAMLIOTest.cpp
@@ -16,16 +16,17 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+using llvm::yaml::Hex16;
+using llvm::yaml::Hex32;
+using llvm::yaml::Hex64;
+using llvm::yaml::Hex8;
using llvm::yaml::Input;
-using llvm::yaml::Output;
using llvm::yaml::IO;
-using llvm::yaml::MappingTraits;
+using llvm::yaml::isNumeric;
using llvm::yaml::MappingNormalization;
+using llvm::yaml::MappingTraits;
+using llvm::yaml::Output;
using llvm::yaml::ScalarTraits;
-using llvm::yaml::Hex8;
-using llvm::yaml::Hex16;
-using llvm::yaml::Hex32;
-using llvm::yaml::Hex64;
using ::testing::StartsWith;
@@ -2569,3 +2570,63 @@
TestEscaped((char const *)foobar, "\"foo\\u200Bbar\"");
}
}
+
+TEST(YAMLIO, Numeric) {
+ EXPECT_TRUE(isNumeric(".inf"));
+ EXPECT_TRUE(isNumeric(".INF"));
+ EXPECT_TRUE(isNumeric(".Inf"));
+ EXPECT_TRUE(isNumeric("-.inf"));
+ EXPECT_TRUE(isNumeric("+.inf"));
+
+ EXPECT_TRUE(isNumeric(".nan"));
+ EXPECT_TRUE(isNumeric(".NaN"));
+ EXPECT_TRUE(isNumeric(".NAN"));
+
+ EXPECT_TRUE(isNumeric("0"));
+ EXPECT_TRUE(isNumeric("0."));
+ EXPECT_TRUE(isNumeric("0.0"));
+ EXPECT_TRUE(isNumeric("-0.0"));
+ EXPECT_TRUE(isNumeric("+0.0"));
+
+ EXPECT_TRUE(isNumeric("+12.0"));
+ EXPECT_TRUE(isNumeric(".5"));
+ EXPECT_TRUE(isNumeric("+.5"));
+ EXPECT_TRUE(isNumeric("-1.0"));
+
+ EXPECT_TRUE(isNumeric("2.3e4"));
+ EXPECT_TRUE(isNumeric("-2E+05"));
+ EXPECT_TRUE(isNumeric("+12e03"));
+ EXPECT_TRUE(isNumeric("6.8523015e+5"));
+
+ EXPECT_TRUE(isNumeric("1.e+1"));
+ EXPECT_TRUE(isNumeric(".0e+1"));
+
+ EXPECT_TRUE(isNumeric("0x2aF3"));
+ EXPECT_TRUE(isNumeric("0o01234567"));
+
+ EXPECT_FALSE(isNumeric("not a number"));
+ EXPECT_FALSE(isNumeric("."));
+ EXPECT_FALSE(isNumeric(".e+1"));
+ EXPECT_FALSE(isNumeric(".1e"));
+ EXPECT_FALSE(isNumeric(".1e+"));
+ EXPECT_FALSE(isNumeric(".1e++1"));
+
+ EXPECT_FALSE(isNumeric("+0x2AF3"));
+ EXPECT_FALSE(isNumeric("-0x2AF3"));
+ EXPECT_FALSE(isNumeric("0x2AF3Z"));
+ EXPECT_FALSE(isNumeric("0o012345678"));
+ EXPECT_FALSE(isNumeric("-0o012345678"));
+
+ // Deprecated formats: as for YAML 1.2 specification, the following are not
+ // valid numbers anymore:
+ //
+ // * Octal numbers with "0" prefix instead of "0o"
+ // * Sexagecimal numbers
+ // * Decimal numbers with comma s the delimiter
+ // * "inf", "nan" without '.' prefix
+ EXPECT_FALSE(isNumeric("3:25:45"));
+ EXPECT_FALSE(isNumeric("014"));
+ EXPECT_FALSE(isNumeric("+12,345"));
+ EXPECT_FALSE(isNumeric("-inf"));
+ EXPECT_FALSE(isNumeric("1,230.15"));
+}
Index: llvm/include/llvm/Support/YAMLTraits.h
===================================================================
--- llvm/include/llvm/Support/YAMLTraits.h
+++ llvm/include/llvm/Support/YAMLTraits.h
@@ -27,6 +27,7 @@
#include <cctype>
#include <cstddef>
#include <cstdint>
+#include <iterator>
#include <map>
#include <memory>
#include <new>
@@ -449,46 +450,76 @@
static bool const value = (sizeof(test<DocumentListTraits<T>>(nullptr))==1);
};
-inline bool isNumber(StringRef S) {
- static const char OctalChars[] = "01234567";
- if (S.startswith("0") &&
- S.drop_front().find_first_not_of(OctalChars) == StringRef::npos)
- return true;
-
- if (S.startswith("0o") &&
- S.drop_front(2).find_first_not_of(OctalChars) == StringRef::npos)
- return true;
+inline bool isNumeric(StringRef S) {
+ if (S.empty())
+ return false;
- static const char HexChars[] = "0123456789abcdefABCDEF";
- if (S.startswith("0x") &&
- S.drop_front(2).find_first_not_of(HexChars) == StringRef::npos)
+ if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
return true;
- static const char DecChars[] = "0123456789";
- if (S.find_first_not_of(DecChars) == StringRef::npos)
- return true;
+ StringRef Tail = (S.front() == '-' || S.front() == '+') ? S.drop_front() : S;
- if (S.equals(".inf") || S.equals(".Inf") || S.equals(".INF"))
+ if (Tail.equals(".inf") || Tail.equals(".Inf") || Tail.equals(".INF"))
return true;
- Regex FloatMatcher("^(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?$");
- if (FloatMatcher.match(S))
+ static const char HexChars[] = "0123456789abcdefABCDEF";
+ static const char OctalChars[] = "01234567";
+ bool ParseHex = S.startswith("0x");
+ bool ParseOct = S.startswith("0o");
+ if (ParseHex || ParseOct) {
+ if (S.size() < 3)
+ return false;
+ for (const auto &Char : S.drop_front(2)) {
+ if (ParseHex && std::find(std::begin(HexChars), std::end(HexChars),
+ Char) == std::end(HexChars))
+ return false;
+ if (ParseOct && std::find(std::begin(OctalChars), std::end(OctalChars),
+ Char) == std::end(OctalChars))
+ return false;
+ }
return true;
+ }
- return false;
-}
-
-inline bool isNumeric(StringRef S) {
- if ((S.front() == '-' || S.front() == '+') && isNumber(S.drop_front()))
- return true;
+ static const char DecChars[] = "0123456789";
- if (isNumber(S))
- return true;
+ // Leading zeros are not allowed.
+ if (Tail.front() == '0' && Tail.size() > 1 &&
+ std::find(std::begin(DecChars), std::end(DecChars), Tail[1]) !=
+ std::end(DecChars))
+ return false;
+
+ // Parse float: [-+]? (\. [0-9]+ | [0-9]+ (\. [0-9]* )?) ([eE] [-+]? [0-9]+)?
+ bool FoundDot = false;
+ bool FoundExponent = false;
+ for (size_t I = 0; I < Tail.size(); ++I) {
+ char Symbol = Tail[I];
+ if (Symbol == '.') {
+ // There can only be one dot in the number.
+ if (FoundDot)
+ return false;
+ FoundDot = true;
+ // If string starts with '.' it has to be followed by at least one digit.
+ if (I == 0 && (Tail.size() == 1 || Tail.find_first_of(DecChars) != 1))
+ return false;
+ } else if (Symbol == 'e' || Symbol == 'E') {
+ // There can only be one exponent sign in the number.
+ if (FoundExponent)
+ return false;
+ FoundExponent = true;
+ } else if (Symbol == '+' || Symbol == '-') {
+ // Sign can only follow an exponent sign.
+ if (!FoundExponent || (Tail[I - 1] != 'e' && Tail[I - 1] != 'E'))
+ return false;
+ } else if ('0' > Symbol || Symbol > '9') {
+ return false;
+ }
+ }
- if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
- return true;
+ // Exponent sign has been found: it should be followed by at least one digit.
+ if (FoundExponent)
+ return ('0' <= S.back() && S.back() <= '9');
- return false;
+ return true;
}
inline bool isNull(StringRef S) {
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits