From 3352fa2e8bae4e9f3f8e66c40ad536a897752f98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Vl=C4=8Dek?= <lukas.vlcek@aiven.io>
Date: Wed, 8 Nov 2023 15:15:16 +0100
Subject: [PATCH] Deprecate CamelCase PathHierarchy tokenizer name (#10894)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Deprecate CamelCase PathHierarchy tokenizer name in favor of lowercase path_hierarchy.

Signed-off-by: Lukáš Vlček <lukas.vlcek@aiven.io>
Signed-off-by: Shivansh Arora <hishiv@amazon.com>
---
 CHANGELOG.md                                  |  1 +
 .../common/CommonAnalysisModulePlugin.java    | 25 +++++++++--
 .../PathHierarchyTokenizerFactoryTests.java   | 45 +++++++++++++++++++
 .../test/analysis-common/30_tokenizers.yml    |  9 +++-
 4 files changed, 76 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2011769858699..772afcb0ea715 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -63,6 +63,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Return 409 Conflict HTTP status instead of 503 on failure to concurrently execute snapshots ([#8986](https://github.com/opensearch-project/OpenSearch/pull/5855))
 - Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/))
 - Performance improvement for Datetime field caching ([#4558](https://github.com/opensearch-project/OpenSearch/issues/4558))
+- Deprecate CamelCase `PathHierarchy` tokenizer name in favor of lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894))
 
 
 ### Deprecated
diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java
index edb8c37c2dbdd..cf2736a8583d2 100644
--- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java
+++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java
@@ -394,7 +394,17 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
         // TODO deprecate and remove in API
         tokenizers.put("lowercase", XLowerCaseTokenizerFactory::new);
         tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
-        tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
+        tokenizers.put("PathHierarchy", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
+            // TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception
+            if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_3_0_0)) {
+                deprecationLogger.deprecate(
+                    "PathHierarchy_tokenizer_deprecation",
+                    "The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. "
+                        + "Please change the tokenizer name to [path_hierarchy] instead."
+                );
+            }
+            return new PathHierarchyTokenizerFactory(indexSettings, environment, name, settings);
+        });
         tokenizers.put("pattern", PatternTokenizerFactory::new);
         tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
         tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
@@ -662,8 +672,17 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
             }
             return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
         }));
-        tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));
-
+        tokenizers.add(PreConfiguredTokenizer.openSearchVersion("PathHierarchy", (version) -> {
+            // TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception
+            if (version.onOrAfter(Version.V_3_0_0)) {
+                deprecationLogger.deprecate(
+                    "PathHierarchy_tokenizer_deprecation",
+                    "The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. "
+                        + "Please change the tokenizer name to [path_hierarchy] instead."
+                );
+            }
+            return new PathHierarchyTokenizer();
+        }));
         return tokenizers;
     }
 }
diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java
index 1fe7c582449ec..555d6c78b6ec5 100644
--- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java
@@ -35,16 +35,61 @@
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 
 import org.apache.lucene.analysis.Tokenizer;
+import org.opensearch.Version;
+import org.opensearch.cluster.metadata.IndexMetadata;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.core.index.Index;
+import org.opensearch.env.Environment;
+import org.opensearch.env.TestEnvironment;
+import org.opensearch.index.IndexSettings;
+import org.opensearch.index.analysis.IndexAnalyzers;
+import org.opensearch.index.analysis.NamedAnalyzer;
+import org.opensearch.indices.analysis.AnalysisModule;
 import org.opensearch.test.IndexSettingsModule;
 import org.opensearch.test.OpenSearchTokenStreamTestCase;
+import org.opensearch.test.VersionUtils;
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.Collections;
 
 public class PathHierarchyTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
 
+    private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetadata.SETTING_VERSION_CREATED, version)
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+        return new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(new CommonAnalysisModulePlugin()))
+            .getAnalysisRegistry()
+            .build(idxSettings);
+    }
+
+    /**
+     * Test that the deprecated "PathHierarchy" tokenizer name still works via {@link CommonAnalysisModulePlugin} on 3.0.0+ indices.
+     */
+    public void testPreConfiguredTokenizer() throws IOException {
+
+        {
+            try (
+                IndexAnalyzers indexAnalyzers = buildAnalyzers(
+                    VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT),
+                    "PathHierarchy"
+                )
+            ) {
+                NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
+                assertNotNull(analyzer);
+                assertTokenStreamContents(analyzer.tokenStream("dummy", "/a/b/c"), new String[] { "/a", "/a/b", "/a/b/c" });
+                // Once LUCENE-12750 is fixed we can use the following testing method instead.
+                // Similar testing approach has been used for deprecation of (Edge)NGrams tokenizers as well.
+                // assertAnalyzesTo(analyzer, "/a/b/c", new String[] { "/a", "/a/b", "/a/b/c" });
+
+            }
+        }
+    }
+
     public void testDefaults() throws IOException {
         final Index index = new Index("test", "_na_");
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
index 56ed2175df60a..179de835a4105 100644
--- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
+++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
@@ -298,6 +298,9 @@
 
 ---
 "path_hierarchy":
+    - skip:
+        features: "allowed_warnings"
+
     - do:
         indices.analyze:
           body:
@@ -312,6 +315,8 @@
     - match:  { detail.tokenizer.tokens.2.token: a/b/c }
 
     - do:
+        allowed_warnings:
+          - 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.'
         indices.analyze:
           body:
             text: "a/b/c"
@@ -337,11 +342,13 @@
     - match:  { detail.tokenizer.tokens.2.token: a/b/c }
 
     - do:
+        allowed_warnings:
+          - 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.'
         indices.analyze:
           body:
             text: "a/b/c"
             explain: true
-            tokenizer:  PathHierarchy
+            tokenizer: PathHierarchy
     - length: { detail.tokenizer.tokens: 3 }
     - match:  { detail.tokenizer.name: PathHierarchy }
     - match:  { detail.tokenizer.tokens.0.token: a }