Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate and remove camel-case nGram and edgeNGram tokenizers #50862

Merged
merged 7 commits into from
Jan 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/reference/migration/migrate_8_0/analysis.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,12 @@
The `nGram` and `edgeNGram` token filter names that have been deprecated since
version 6.4 have been removed. Both token filters can only be used by their
alternative names `ngram` and `edge_ngram` since version 7.0.

[float]
[[nGram-edgeNGram-tokenizer-dreprecation]]
==== Disallow use of the `nGram` and `edgeNGram` tokenizer names

The `nGram` and `edgeNGram` tokenizer names haven been deprecated with 7.6 and are no longer
supported on new indices. Mappings for indices created after 7.6 will continue to work but
emit a deprecation warning. The tokenizer name should be changed to the fully equivalent
`ngram` or `edge_ngram` names for new indices and in index templates.
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,29 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
tokenizers.put("thai", ThaiTokenizerFactory::new);
tokenizers.put("nGram", NGramTokenizerFactory::new);
tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
} else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead.");
}
return new NGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("ngram", NGramTokenizerFactory::new);
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [edge_nGram] for indices created in versions 8 or higher instead.");
} else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead.");
}
return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
tokenizers.put("classic", ClassicTokenizerFactory::new);
Expand Down Expand Up @@ -522,8 +542,26 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));

// Temporary shim for aliases. TODO deprecate after they are moved
tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("nGram", (version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
} else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead.");
}
return new NGramTokenizer();
}));
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead.");
} else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead.");
}
if (version.onOrAfter(Version.V_7_3_0)) {
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,18 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.util.Map;

public class CommonAnalysisPluginTests extends ESTestCase {

Expand Down Expand Up @@ -102,4 +105,82 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
+ "Please change the filter name to [edge_ngram] instead.");
}
}

/**
* Check that we log a deprecation warning for "nGram" and "edgeNGram" tokenizer names with 7.6 and
* disallow usages for indices created after 8.0
*/
public void testNGramTokenizerDeprecation() throws IOException {
// tests for prebuilt tokenizer
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
true);
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));

// same batch of tests for custom tokenizer definition in the settings
doTestCustomTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
doTestCustomTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
true);
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("nGram", "ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
}

public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
throws IOException {
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, version).build();

try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You shouldn't need to pull a factory and call create to get warnings anymore, they will be emitted when createTestAnalysis is called.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This tests prebuild tokenizer directly though, e.g. when they are used via “_analyze” endpoint directly they wouldn’t appear in analyzers. createTestAnalysis doesn’t trigger the warning in that case if nothing refers to them from analysis settings.


Tokenizer tokenizer = tokenizerFactory.create();
assertNotNull(tokenizer);
if (expectWarning) {
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [" + replacement + "] instead.");
}
}
}

public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
throws IOException {
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, version)
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer")
.put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName)
.build();

try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);

if (expectWarning) {
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [" + replacement + "] instead.");
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,11 @@ public void testPreConfiguredTokenizer() throws IOException {
}
}

// Check deprecated name as well
// Check deprecated name as well, needs version before 8.0 because throws IAE after that
{
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(
VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, VersionUtils.getPreviousVersion(Version.V_8_0_0)),
"edgeNGram")) {
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
assertNotNull(analyzer);
assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});
Expand Down