From d7cfad6b1fc982a76a8b89097026da9daf75ceb1 Mon Sep 17 00:00:00 2001 From: Peng Huo Date: Tue, 2 Feb 2021 21:22:42 -0800 Subject: [PATCH] Disable access to the field keyword in the new SQL engine (#1025) * update * add breaking change doc --- docs/dev/NewSQLEngine.md | 6 ++++ docs/experiment/ppl/cmd/search.rst | 28 ++++++++-------- docs/user/general/identifiers.rst | 32 +++++++++---------- .../elasticsearch/mapping/IndexMapping.java | 9 ------ .../storage/ElasticsearchIndex.java | 10 +++++- .../client/ElasticsearchNodeClientTest.java | 4 +-- .../client/ElasticsearchRestClientTest.java | 4 +-- .../sql/legacy/PrettyFormatResponseIT.java | 31 ------------------ .../sql/sql/QueryValidationIT.java | 23 ++++++++++++- 9 files changed, 69 insertions(+), 78 deletions(-) diff --git a/docs/dev/NewSQLEngine.md b/docs/dev/NewSQLEngine.md index a9034ef39e..b891810e75 100644 --- a/docs/dev/NewSQLEngine.md +++ b/docs/dev/NewSQLEngine.md @@ -35,11 +35,17 @@ As for correctness, besides full coverage of unit and integration test, we devel ### 3.1 Breaking Changes +#### 3.1.1 Response Because of implementation changed internally, you can expect Explain output in a different format. For query protocol, there are slightly changes on two fields' value in the default response format: * **Schema**: Previously the `name` and `alias` value differed for different queries. For consistency, name is always the original text now and alias is its alias defined in SELECT clause or absent if none. * **Total**: The `total` field represented how many documents matched in total no matter how many returned (indicated by `size` field). However, this field becomes meaningless because of post processing on DSL response in the new query engine. Thus, for now the total number is always same as size field. +#### 3.1.2 Syntax and Semantic + +* **field.keyword**: In Elasticsearch, user could assign different type for one field. The most common use case is assign `text` and `keyword` type for string field. In the legacy engine, user could select the `field.keyword` from the index. e.g. `SELECT addr.state FROM account WHERE addr.state = 'WA' `. In the new engine, user don't have access the to the `addr.state` field, instead, the query engine will handle it automatically. + + ### 3.2 Limitations You can find all the limitations in [Limitations](/docs/user/limitations/limitations.rst). For these unsupported features, the query will be forwarded to the old query engine by fallback mechanism. To avoid impact on your side, normally you won't see any difference in a query response. If you want to check if and why your query falls back to be handled by old SQL engine, please explain your query and check Elasticsearch log for "Request is falling back to old SQL engine due to ...". diff --git a/docs/experiment/ppl/cmd/search.rst b/docs/experiment/ppl/cmd/search.rst index 5e3b825b5e..d1a943f24f 100644 --- a/docs/experiment/ppl/cmd/search.rst +++ b/docs/experiment/ppl/cmd/search.rst @@ -32,14 +32,14 @@ PPL query:: od> source=accounts; fetched rows / total rows = 4/4 - +------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------+ - | account_number | firstname | address | gender | city | lastname | balance | employer | state | age | email | - |------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------| - | 1 | Amber | 880 Holmes Lane | M | Brogan | Duke | 39225 | Pyrami | IL | 32 | amberduke@pyrami.com | - | 6 | Hattie | 671 Bristol Street | M | Dante | Bond | 5686 | Netagy | TN | 36 | hattiebond@netagy.com | - | 13 | Nanette | 789 Madison Street | F | Nogal | Bates | 32838 | Quility | VA | 28 | null | - | 18 | Dale | 467 Hutchinson Court | M | Orick | Adams | 4180 | null | MD | 33 | daleadams@boink.com | - +------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------+ + +------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | + |------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | + +------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------+ Example 2: Fetch data with condition ==================================== @@ -50,10 +50,10 @@ PPL query:: od> source=accounts account_number=1 or gender="F"; fetched rows / total rows = 2/2 - +------------------+-------------+--------------------+----------+--------+------------+-----------+------------+---------+-------+----------------------+ - | account_number | firstname | address | gender | city | lastname | balance | employer | state | age | email | - |------------------+-------------+--------------------+----------+--------+------------+-----------+------------+---------+-------+----------------------| - | 1 | Amber | 880 Holmes Lane | M | Brogan | Duke | 39225 | Pyrami | IL | 32 | amberduke@pyrami.com | - | 13 | Nanette | 789 Madison Street | F | Nogal | Bates | 32838 | Quility | VA | 28 | null | - +------------------+-------------+--------------------+----------+--------+------------+-----------+------------+---------+-------+----------------------+ + +------------------+-------------+--------------------+-----------+----------+--------+------------+---------+-------+----------------------+------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | + |------------------+-------------+--------------------+-----------+----------+--------+------------+---------+-------+----------------------+------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | + +------------------+-------------+--------------------+-----------+----------+--------+------------+---------+-------+----------------------+------------+ diff --git a/docs/user/general/identifiers.rst b/docs/user/general/identifiers.rst index a2c499f32c..b94593228b 100644 --- a/docs/user/general/identifiers.rst +++ b/docs/user/general/identifiers.rst @@ -40,14 +40,14 @@ Here are examples for using index pattern directly without quotes:: od> SELECT * FROM *cc*nt*; fetched rows / total rows = 4/4 - +------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------+ - | account_number | firstname | address | gender | city | lastname | balance | employer | state | age | email | - |------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------| - | 1 | Amber | 880 Holmes Lane | M | Brogan | Duke | 39225 | Pyrami | IL | 32 | amberduke@pyrami.com | - | 6 | Hattie | 671 Bristol Street | M | Dante | Bond | 5686 | Netagy | TN | 36 | hattiebond@netagy.com | - | 13 | Nanette | 789 Madison Street | F | Nogal | Bates | 32838 | Quility | VA | 28 | null | - | 18 | Dale | 467 Hutchinson Court | M | Orick | Adams | 4180 | null | MD | 33 | daleadams@boink.com | - +------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------+ + +------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | + |------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | + +------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------+ Delimited Identifiers @@ -76,14 +76,14 @@ Here are examples for quoting an index name by back ticks:: od> SELECT * FROM `accounts`; fetched rows / total rows = 4/4 - +------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------+ - | account_number | firstname | address | gender | city | lastname | balance | employer | state | age | email | - |------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------| - | 1 | Amber | 880 Holmes Lane | M | Brogan | Duke | 39225 | Pyrami | IL | 32 | amberduke@pyrami.com | - | 6 | Hattie | 671 Bristol Street | M | Dante | Bond | 5686 | Netagy | TN | 36 | hattiebond@netagy.com | - | 13 | Nanette | 789 Madison Street | F | Nogal | Bates | 32838 | Quility | VA | 28 | null | - | 18 | Dale | 467 Hutchinson Court | M | Orick | Adams | 4180 | null | MD | 33 | daleadams@boink.com | - +------------------+-------------+----------------------+----------+--------+------------+-----------+------------+---------+-------+-----------------------+ + +------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------+ + | account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname | + |------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------| + | 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke | + | 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond | + | 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates | + | 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams | + +------------------+-------------+----------------------+-----------+----------+--------+------------+---------+-------+-----------------------+------------+ Case Sensitivity diff --git a/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/mapping/IndexMapping.java b/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/mapping/IndexMapping.java index 4440d9854d..bf92275964 100644 --- a/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/mapping/IndexMapping.java +++ b/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/mapping/IndexMapping.java @@ -101,15 +101,6 @@ private void flatMappings( func.accept(fullFieldName, type); } - if (isMultiField(mapping)) { - ((Map>) mapping.get("fields")) - .forEach( - (innerFieldName, innerMapping) -> - func.accept( - fullFieldName + "." + innerFieldName, - (String) innerMapping.getOrDefault("type", "object"))); - } - if (mapping.containsKey("properties")) { // Nested field flatMappings((Map) mapping.get("properties"), fullFieldName, func); } diff --git a/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/storage/ElasticsearchIndex.java b/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/storage/ElasticsearchIndex.java index accfff5285..bc8091d601 100644 --- a/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/storage/ElasticsearchIndex.java +++ b/elasticsearch/src/main/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/storage/ElasticsearchIndex.java @@ -54,6 +54,11 @@ public class ElasticsearchIndex implements Table { /** Current Elasticsearch index name. */ private final String indexName; + /** + * The cached mapping of field and type in index. + */ + private Map cachedFieldTypes = null; + /* * TODO: Assume indexName doesn't have wildcard. * Need to either handle field name conflicts @@ -61,7 +66,10 @@ public class ElasticsearchIndex implements Table { */ @Override public Map getFieldTypes() { - return new ElasticsearchDescribeIndexRequest(client, indexName).getFieldTypes(); + if (cachedFieldTypes == null) { + cachedFieldTypes = new ElasticsearchDescribeIndexRequest(client, indexName).getFieldTypes(); + } + return cachedFieldTypes; } /** diff --git a/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchNodeClientTest.java b/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchNodeClientTest.java index 642c44e036..fc936b53ad 100644 --- a/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchNodeClientTest.java +++ b/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchNodeClientTest.java @@ -111,7 +111,7 @@ public void getIndexMappings() throws IOException { assertEquals(1, indexMappings.size()); IndexMapping indexMapping = indexMappings.values().iterator().next(); - assertEquals(20, indexMapping.size()); + assertEquals(18, indexMapping.size()); assertEquals("text", indexMapping.getFieldType("address")); assertEquals("integer", indexMapping.getFieldType("age")); assertEquals("double", indexMapping.getFieldType("balance")); @@ -121,7 +121,6 @@ public void getIndexMappings() throws IOException { assertEquals("some_new_es_type_outside_type_system", indexMapping.getFieldType("new_field")); assertEquals("text", indexMapping.getFieldType("field with spaces")); assertEquals("text_keyword", indexMapping.getFieldType("employer")); - assertEquals("keyword", indexMapping.getFieldType("employer.raw")); assertEquals("nested", indexMapping.getFieldType("projects")); assertEquals("boolean", indexMapping.getFieldType("projects.active")); assertEquals("date", indexMapping.getFieldType("projects.release")); @@ -129,7 +128,6 @@ public void getIndexMappings() throws IOException { assertEquals("text", indexMapping.getFieldType("projects.members.name")); assertEquals("object", indexMapping.getFieldType("manager")); assertEquals("text_keyword", indexMapping.getFieldType("manager.name")); - assertEquals("keyword", indexMapping.getFieldType("manager.name.keyword")); assertEquals("keyword", indexMapping.getFieldType("manager.address")); assertEquals("long", indexMapping.getFieldType("manager.salary")); } diff --git a/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchRestClientTest.java b/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchRestClientTest.java index 60f0adeaef..725f9a34ba 100644 --- a/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchRestClientTest.java +++ b/elasticsearch/src/test/java/com/amazon/opendistroforelasticsearch/sql/elasticsearch/client/ElasticsearchRestClientTest.java @@ -110,7 +110,7 @@ void getIndexMappings() throws IOException { assertEquals(1, indexMappings.size()); IndexMapping indexMapping = indexMappings.values().iterator().next(); - assertEquals(20, indexMapping.size()); + assertEquals(18, indexMapping.size()); assertEquals("text", indexMapping.getFieldType("address")); assertEquals("integer", indexMapping.getFieldType("age")); assertEquals("double", indexMapping.getFieldType("balance")); @@ -120,7 +120,6 @@ void getIndexMappings() throws IOException { assertEquals("some_new_es_type_outside_type_system", indexMapping.getFieldType("new_field")); assertEquals("text", indexMapping.getFieldType("field with spaces")); assertEquals("text_keyword", indexMapping.getFieldType("employer")); - assertEquals("keyword", indexMapping.getFieldType("employer.raw")); assertEquals("nested", indexMapping.getFieldType("projects")); assertEquals("boolean", indexMapping.getFieldType("projects.active")); assertEquals("date", indexMapping.getFieldType("projects.release")); @@ -128,7 +127,6 @@ void getIndexMappings() throws IOException { assertEquals("text", indexMapping.getFieldType("projects.members.name")); assertEquals("object", indexMapping.getFieldType("manager")); assertEquals("text_keyword", indexMapping.getFieldType("manager.name")); - assertEquals("keyword", indexMapping.getFieldType("manager.name.keyword")); assertEquals("keyword", indexMapping.getFieldType("manager.address")); assertEquals("long", indexMapping.getFieldType("manager.salary")); } diff --git a/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/legacy/PrettyFormatResponseIT.java b/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/legacy/PrettyFormatResponseIT.java index bbe47c946f..5039e9e61e 100644 --- a/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/legacy/PrettyFormatResponseIT.java +++ b/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/legacy/PrettyFormatResponseIT.java @@ -136,26 +136,6 @@ public void selectWrongField() throws IOException { assertThat(getDataRows(response).length(), equalTo(RESPONSE_DEFAULT_MAX_SIZE)); } - @Ignore("Issue 1019, Breaking Change, the keyword should not been allowed, semantic exception " - + "is expected") - @Test - public void selectKeyword() throws IOException { - JSONObject response = executeQuery( - String.format(Locale.ROOT, "SELECT firstname.keyword FROM %s", - TestsConstants.TEST_INDEX_ACCOUNT)); - - List fields = Collections.singletonList("firstname.keyword"); - assertContainsColumns(getSchema(response), fields); - - /* - * firstname.keyword will appear in Schema but because there is no 'firstname.keyword' in SearchHits source - * the DataRows will output null. - * - * Looks like x-pack adds this keyword field to "docvalue_fields", this is likely how it ends up in SearchHits - */ - // assertContainsData(getDataRows(protocol), fields); - } - @Test public void selectScore() throws IOException { JSONObject response = executeQuery( @@ -377,17 +357,6 @@ public void aggregationFunctionInSelectWithAlias() throws IOException { } } - @Test - public void aggregationFunctionInSelectGroupByMultipleFields() throws IOException { - JSONObject response = executeQuery( - String.format(Locale.ROOT, "SELECT SUM(age) FROM %s GROUP BY age, state.keyword", - TestsConstants.TEST_INDEX_ACCOUNT)); - - List fields = Arrays.asList("SUM(age)"); - assertContainsColumns(getSchema(response), fields); - assertContainsData(getDataRows(response), fields); - } - @Test public void aggregationFunctionInSelectNoGroupBy() throws IOException { JSONObject response = executeQuery(String.format(Locale.ROOT, "SELECT SUM(age) FROM %s", diff --git a/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/sql/QueryValidationIT.java b/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/sql/QueryValidationIT.java index 8214536fdf..2e14eb7fbc 100644 --- a/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/sql/QueryValidationIT.java +++ b/integ-test/src/test/java/com/amazon/opendistroforelasticsearch/sql/sql/QueryValidationIT.java @@ -65,10 +65,31 @@ public void testNonAggregatedSelectColumnPresentWithoutGroupByClause() throws IO .hasStatusCode(BAD_REQUEST) .hasErrorType("SemanticCheckException") .containsMessage("Explicit GROUP BY clause is required because expression [state] " - + "contains non-aggregated column") + + "contains non-aggregated column") .whenExecute("SELECT state, AVG(age) FROM elasticsearch-sql_test_index_account"); } + @Test + public void testQueryFieldWithKeyword() throws IOException { + expectResponseException() + .hasStatusCode(BAD_REQUEST) + .hasErrorType("SemanticCheckException") + .containsMessage( + "can't resolve Symbol(namespace=FIELD_NAME, name=firstname.keyword) in type env") + .whenExecute("SELECT firstname.keyword FROM elasticsearch-sql_test_index_account"); + } + + @Test + public void aggregationFunctionInSelectGroupByMultipleFields() throws IOException { + expectResponseException() + .hasStatusCode(BAD_REQUEST) + .hasErrorType("SemanticCheckException") + .containsMessage( + "can't resolve Symbol(namespace=FIELD_NAME, name=state.keyword) in type env") + .whenExecute( + "SELECT SUM(age) FROM elasticsearch-sql_test_index_account GROUP BY state.keyword"); + } + public ResponseExceptionAssertion expectResponseException() { return new ResponseExceptionAssertion(exceptionRule); }