From 3352fa2e8bae4e9f3f8e66c40ad536a897752f98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Vl=C4=8Dek?= Date: Wed, 8 Nov 2023 15:15:16 +0100 Subject: [PATCH] Deprecate CamelCase PathHierarchy tokenizer name (#10894) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deprecate CamelCase PathHierarchy tokenizer name in favor to lowercase path_hierarchy. Signed-off-by: Lukáš Vlček Signed-off-by: Shivansh Arora --- CHANGELOG.md | 1 + .../common/CommonAnalysisModulePlugin.java | 25 +++++++++-- .../PathHierarchyTokenizerFactoryTests.java | 45 +++++++++++++++++++ .../test/analysis-common/30_tokenizers.yml | 9 +++- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2011769858699..772afcb0ea715 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Return 409 Conflict HTTP status instead of 503 on failure to concurrently execute snapshots ([#8986](https://github.com/opensearch-project/OpenSearch/pull/5855)) - Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/)) - Performance improvement for Datetime field caching ([#4558](https://github.com/opensearch-project/OpenSearch/issues/4558)) +- Deprecate CamelCase `PathHierarchy` tokenizer name in favor to lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894)) ### Deprecated diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index edb8c37c2dbdd..cf2736a8583d2 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -394,7 +394,17 @@ public Map> getTokenizers() { // TODO deprecate and remove in API tokenizers.put("lowercase", XLowerCaseTokenizerFactory::new); tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new); - tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new); + tokenizers.put("PathHierarchy", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> { + // TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_3_0_0)) { + deprecationLogger.deprecate( + "PathHierarchy_tokenizer_deprecation", + "The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [path_hierarchy] instead." + ); + } + return new PathHierarchyTokenizerFactory(indexSettings, environment, name, settings); + }); tokenizers.put("pattern", PatternTokenizerFactory::new); tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new); tokenizers.put("whitespace", WhitespaceTokenizerFactory::new); @@ -662,8 +672,17 @@ public List getPreConfiguredTokenizers() { } return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); })); - tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new)); - + tokenizers.add(PreConfiguredTokenizer.openSearchVersion("PathHierarchy", (version) -> { + // TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception + if (version.onOrAfter(Version.V_3_0_0)) { + deprecationLogger.deprecate( + "PathHierarchy_tokenizer_deprecation", + "The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. " + + "Please change the tokenizer name to [path_hierarchy] instead." + ); + } + return new PathHierarchyTokenizer(); + })); return tokenizers; } } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java index 1fe7c582449ec..555d6c78b6ec5 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java @@ -35,16 +35,61 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.analysis.Tokenizer; +import org.opensearch.Version; +import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; import org.opensearch.core.index.Index; +import org.opensearch.env.Environment; +import org.opensearch.env.TestEnvironment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.IndexAnalyzers; +import org.opensearch.index.analysis.NamedAnalyzer; +import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.test.IndexSettingsModule; import org.opensearch.test.OpenSearchTokenStreamTestCase; +import org.opensearch.test.VersionUtils; import java.io.IOException; import java.io.StringReader; +import java.util.Collections; public class PathHierarchyTokenizerFactoryTests extends OpenSearchTokenStreamTestCase { + private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException { + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); + Settings indexSettings = Settings.builder() + .put(IndexMetadata.SETTING_VERSION_CREATED, version) + .put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); + return new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(new CommonAnalysisModulePlugin())) + .getAnalysisRegistry() + .build(idxSettings); + } + + /** + * Test that deprecated "PathHierarchy" tokenizer name is still available via {@link CommonAnalysisModulePlugin} starting in 3.x. + */ + public void testPreConfiguredTokenizer() throws IOException { + + { + try ( + IndexAnalyzers indexAnalyzers = buildAnalyzers( + VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT), + "PathHierarchy" + ) + ) { + NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer"); + assertNotNull(analyzer); + assertTokenStreamContents(analyzer.tokenStream("dummy", "/a/b/c"), new String[] { "/a", "/a/b", "/a/b/c" }); + // Once LUCENE-12750 is fixed we can use the following testing method instead. + // Similar testing approach has been used for deprecation of (Edge)NGrams tokenizers as well. + // assertAnalyzesTo(analyzer, "/a/b/c", new String[] { "/a", "/a/b", "/a/b/c" }); + + } + } + } + public void testDefaults() throws IOException { final Index index = new Index("test", "_na_"); final Settings indexSettings = newAnalysisSettingsBuilder().build(); diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml index 56ed2175df60a..179de835a4105 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml @@ -298,6 +298,9 @@ --- "path_hierarchy": + - skip: + features: "allowed_warnings" + - do: indices.analyze: body: @@ -312,6 +315,8 @@ - match: { detail.tokenizer.tokens.2.token: a/b/c } - do: + allowed_warnings: + - 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.' indices.analyze: body: text: "a/b/c" @@ -337,11 +342,13 @@ - match: { detail.tokenizer.tokens.2.token: a/b/c } - do: + allowed_warnings: + - 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.' indices.analyze: body: text: "a/b/c" explain: true - tokenizer: PathHierarchy + tokenizer: PathHierarchy - length: { detail.tokenizer.tokens: 3 } - match: { detail.tokenizer.name: PathHierarchy } - match: { detail.tokenizer.tokens.0.token: a }