Repository: spark Updated Branches: refs/heads/master 619c94901 -> 0dd61ec47
[SPARK-25427][SQL][TEST] Add BloomFilter creation test cases ## What changes were proposed in this pull request? Spark supports BloomFilter creation for ORC files. This PR aims to add test coverage to prevent accidental regressions like [SPARK-12417](https://issues.apache.org/jira/browse/SPARK-12417). ## How was this patch tested? Passed Jenkins with the newly added test cases. Closes #22418 from dongjoon-hyun/SPARK-25427. Authored-by: Dongjoon Hyun <[email protected]> Signed-off-by: Wenchen Fan <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0dd61ec4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0dd61ec4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0dd61ec4 Branch: refs/heads/master Commit: 0dd61ec47df7078fd4f77d8c58ecf26c630c700e Parents: 619c949 Author: Dongjoon Hyun <[email protected]> Authored: Mon Sep 17 19:33:51 2018 +0800 Committer: Wenchen Fan <[email protected]> Committed: Mon Sep 17 19:33:51 2018 +0800 ---------------------------------------------------------------------- .../datasources/orc/OrcSourceSuite.scala | 69 ++++++++++++++++++++ .../spark/sql/hive/orc/HiveOrcSourceSuite.scala | 9 +++ 2 files changed, 78 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/0dd61ec4/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 02bfb71..b6bb1d7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -21,7 +21,12 
@@ import java.io.File import java.sql.Timestamp import java.util.Locale +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path import org.apache.orc.OrcConf.COMPRESS +import org.apache.orc.OrcFile +import org.apache.orc.OrcProto.Stream.Kind +import org.apache.orc.impl.RecordReaderImpl import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.Row @@ -50,6 +55,66 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { .createOrReplaceTempView("orc_temp_table") } + protected def testBloomFilterCreation(bloomFilterKind: Kind) { + val tableName = "bloomFilter" + + withTempDir { dir => + withTable(tableName) { + val sqlStatement = orcImp match { + case "native" => + s""" + |CREATE TABLE $tableName (a INT, b STRING) + |USING ORC + |OPTIONS ( + | path '${dir.toURI}', + | orc.bloom.filter.columns '*', + | orc.bloom.filter.fpp 0.1 + |) + """.stripMargin + case "hive" => + s""" + |CREATE TABLE $tableName (a INT, b STRING) + |STORED AS ORC + |LOCATION '${dir.toURI}' + |TBLPROPERTIES ( + | orc.bloom.filter.columns='*', + | orc.bloom.filter.fpp=0.1 + |) + """.stripMargin + case impl => + throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl") + } + + sql(sqlStatement) + sql(s"INSERT INTO $tableName VALUES (1, 'str')") + + val partFiles = dir.listFiles() + .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_")) + assert(partFiles.length === 1) + + val orcFilePath = new Path(partFiles.head.getAbsolutePath) + val readerOptions = OrcFile.readerOptions(new Configuration()) + val reader = OrcFile.createReader(orcFilePath, readerOptions) + var recordReader: RecordReaderImpl = null + try { + recordReader = reader.rows.asInstanceOf[RecordReaderImpl] + + // BloomFilter array is created for all types; `struct`, int (`a`), string (`b`) + val sargColumns = Array(true, true, true) + val orcIndex = recordReader.readRowIndex(0, null, sargColumns) + + // Check the types and counts of bloom filters + 
assert(orcIndex.getBloomFilterKinds.forall(_ === bloomFilterKind)) + assert(orcIndex.getBloomFilterIndex.forall(_.getBloomFilterCount > 0)) + } finally { + if (recordReader != null) { + recordReader.close() + } + } + } + } + } + test("create temporary orc table") { checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10)) @@ -215,4 +280,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext { |) """.stripMargin) } + + test("Check BloomFilter creation") { + testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101 + } } http://git-wip-us.apache.org/repos/asf/spark/blob/0dd61ec4/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index d84f9a3..c1ae2f6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.orc.OrcSuite +import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.types._ @@ -173,4 +174,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { assert(msg.contains("ORC data source does not support calendarinterval data type.")) } } + + test("Check BloomFilter creation") { + Seq(true, false).foreach { convertMetastore => + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") { + testBloomFilterCreation(org.apache.orc.OrcProto.Stream.Kind.BLOOM_FILTER) // 
Before ORC-101 + } + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
