Skip to content

Commit

Permalink
Deprecate and remove camel-case nGram and edgeNGram tokenizers (#50862)
Browse files Browse the repository at this point in the history
We already deprecated and removed the camel-case versions of the nGram and edgeNGram 
filters a while ago and we should do the same with the nGram and edgeNGram tokenizers.
This PR deprecates the use of these names in favour of ngram and edge_ngram in 7
and disallows usage in new indices starting with 8.

Closes #50561
  • Loading branch information
Christoph Büscher authored Jan 14, 2020
1 parent e349c5e commit 9a4357a
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 5 deletions.
9 changes: 9 additions & 0 deletions docs/reference/migration/migrate_8_0/analysis.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,12 @@
The `nGram` and `edgeNGram` token filter names that have been deprecated since
version 6.4 have been removed. Both token filters can only be used by their
alternative names `ngram` and `edge_ngram` since version 7.0.

[float]
[[nGram-edgeNGram-tokenizer-dreprecation]]
==== Disallow use of the `nGram` and `edgeNGram` tokenizer names

The `nGram` and `edgeNGram` tokenizer names haven been deprecated with 7.6 and are no longer
supported on new indices. Mappings for indices created after 7.6 will continue to work but
emit a deprecation warning. The tokenizer name should be changed to the fully equivalent
`ngram` or `edge_ngram` names for new indices and in index templates.
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,29 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
tokenizers.put("thai", ThaiTokenizerFactory::new);
tokenizers.put("nGram", NGramTokenizerFactory::new);
tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
} else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead.");
}
return new NGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("ngram", NGramTokenizerFactory::new);
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [edge_nGram] for indices created in versions 8 or higher instead.");
} else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead.");
}
return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
tokenizers.put("classic", ClassicTokenizerFactory::new);
Expand Down Expand Up @@ -522,8 +542,26 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));

// Temporary shim for aliases. TODO deprecate after they are moved
tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("nGram", (version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
} else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead.");
}
return new NGramTokenizer();
}));
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead.");
} else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead.");
}
if (version.onOrAfter(Version.V_7_3_0)) {
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,18 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.util.Map;

public class CommonAnalysisPluginTests extends ESTestCase {

Expand Down Expand Up @@ -102,4 +105,82 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
+ "Please change the filter name to [edge_ngram] instead.");
}
}

/**
* Check that we log a deprecation warning for "nGram" and "edgeNGram" tokenizer names with 7.6 and
* disallow usages for indices created after 8.0
*/
public void testNGramTokenizerDeprecation() throws IOException {
// tests for prebuilt tokenizer
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
true);
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));

// same batch of tests for custom tokenizer definition in the settings
doTestCustomTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestCustomTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
true);
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
}

public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
throws IOException {
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, version).build();

try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName);

Tokenizer tokenizer = tokenizerFactory.create();
assertNotNull(tokenizer);
if (expectWarning) {
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [" + replacement + "] instead.");
}
}
}

public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
throws IOException {
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, version)
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer")
.put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName)
.build();

try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);

if (expectWarning) {
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [" + replacement + "] instead.");
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,11 @@ public void testPreConfiguredTokenizer() throws IOException {
}
}

// Check deprecated name as well
// Check deprecated name as well, needs version before 8.0 because throws IAE after that
{
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(
VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, VersionUtils.getPreviousVersion(Version.V_8_0_0)),
"edgeNGram")) {
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
assertNotNull(analyzer);
assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
Expand Down

0 comments on commit 9a4357a

Please sign in to comment.