diff --git a/docs/sql-data-sources-load-save-functions.md b/docs/sql-data-sources-load-save-functions.md
index dcb07deb9c2f9..25df34ef5b008 100644
--- a/docs/sql-data-sources-load-save-functions.md
+++ b/docs/sql-data-sources-load-save-functions.md
@@ -107,7 +107,7 @@ For example, you can control bloom filters and dictionary encodings for ORC data
 The following ORC example will create bloom filter and use dictionary encoding only for `favorite_color`.
 For Parquet, there exists `parquet.bloom.filter.enabled` and `parquet.enable.dictionary`, too.
 To find more detailed information about the extra ORC/Parquet options,
-visit the official Apache ORC/Parquet websites.
+visit the official Apache [ORC](https://orc.apache.org/docs/spark-config.html) / [Parquet](https://github.com/apache/parquet-mr/tree/master/parquet-hadoop) websites.
 
 ORC data source:
 
@@ -172,15 +172,15 @@ Parquet data source:
 
 {% highlight sql %}
 CREATE TABLE users_with_options (
-name STRING,
-favorite_color STRING,
-favorite_numbers array<integer>
+  name STRING,
+  favorite_color STRING,
+  favorite_numbers array<integer>
 ) USING parquet
 OPTIONS (
-`parquet.bloom.filter.enabled#favorite_color` true,
-`parquet.bloom.filter.expected.ndv#favorite_color` 1000000,
-parquet.enable.dictionary true,
-parquet.page.write-checksum.enabled true
+  `parquet.bloom.filter.enabled#favorite_color` true,
+  `parquet.bloom.filter.expected.ndv#favorite_color` 1000000,
+  parquet.enable.dictionary true,
+  parquet.page.write-checksum.enabled true
 )
 {% endhighlight %}
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
index c677b2b027cd3..94bda56bc8738 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
@@ -1637,15 +1637,16 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
 
   test("SPARK-34562: Bloom filter push down") {
     withTempPath { dir =>
-      val path = dir.toURI.toString
-      spark.range(100).selectExpr("id * 2 AS id").write
-        .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#id", "true")
+      val path = dir.getCanonicalPath
+      spark.range(100).selectExpr("id * 2 AS id")
+        .write
+        .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#id", true)
+        // Disable dictionary because the number of distinct values is less than 40000.
+        .option(ParquetOutputFormat.ENABLE_DICTIONARY, false)
         .parquet(path)
       Seq(true, false).foreach { bloomFilterEnabled =>
-        withSQLConf(
-          ParquetInputFormat.DICTIONARY_FILTERING_ENABLED -> "false",
-          ParquetInputFormat.BLOOM_FILTERING_ENABLED -> bloomFilterEnabled.toString) {
+        withSQLConf(ParquetInputFormat.BLOOM_FILTERING_ENABLED -> bloomFilterEnabled.toString) {
           val accu = new NumRowGroupsAcc
           sparkContext.register(accu)
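
Illustrative note (not part of the patch): the per-column options in the docs hunk can also be set through the DataFrame writer, which is what the updated test does via the `ParquetOutputFormat` constants. The sketch below assumes a local SparkSession and a hypothetical output path and ndv value; only the option keys (`parquet.bloom.filter.enabled#<column>`, `parquet.bloom.filter.expected.ndv#<column>`, `parquet.enable.dictionary`) come from the docs and test above.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: write Parquet with a bloom filter on one column via the
// DataFrame API, mirroring the SQL OPTIONS in the docs hunk. The object name,
// output path, and ndv value are illustrative assumptions.
object ParquetBloomFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("parquet-bloom-filter-sketch")
      .master("local[*]")
      .getOrCreate()

    val out = "/tmp/users_bloom_sketch" // hypothetical path

    // The "#column" suffix scopes an option to a single column, as the docs
    // example does for `favorite_color`.
    spark.range(100)
      .selectExpr("id * 2 AS id")
      .write
      .mode("overwrite")
      .option("parquet.bloom.filter.enabled#id", "true")
      .option("parquet.bloom.filter.expected.ndv#id", "100")
      // Disable dictionary encoding, as the updated test does, so that the
      // bloom filter (rather than the dictionary) is what can skip row groups.
      .option("parquet.enable.dictionary", "false")
      .parquet(out)

    // Probe for a value that was never written (all ids are even); with
    // read-side bloom filtering enabled, non-matching row groups can be
    // skipped entirely.
    spark.read.parquet(out).where("id = 19").show()

    spark.stop()
  }
}
```

As the patch's own comment notes, dictionary encoding is disabled because with fewer than 40000 distinct values Parquet would keep the column fully dictionary-encoded, in which case dictionary filtering rather than the bloom filter would do the row-group pruning; that also appears to be why the test drops the explicit `DICTIONARY_FILTERING_ENABLED -> "false"` setting from `withSQLConf`.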