From cbc54234c282839619fcb98f273371d8ec53b989 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Sun, 18 Feb 2024 20:14:36 +0800 Subject: [PATCH 001/189] implement chunking processor and fixed token length Signed-off-by: yuye-aws --- .../neuralsearch/plugin/NeuralSearch.java | 7 +- .../processor/DocumentChunkingProcessor.java | 166 ++++++++++++++++++ .../processor/chunker/ChunkerFactory.java | 38 ++++ .../processor/chunker/DelimiterChunker.java | 23 +++ .../chunker/FixedTokenLengthChunker.java | 104 +++++++++++ .../processor/chunker/IFieldChunker.java | 14 ++ 6 files changed, 350 insertions(+), 2 deletions(-) create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/IFieldChunker.java diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index 0182ff4d3..faf12fa12 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -31,6 +31,7 @@ import org.opensearch.neuralsearch.processor.NormalizationProcessorWorkflow; import org.opensearch.neuralsearch.processor.SparseEncodingProcessor; import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor; +import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor; import org.opensearch.neuralsearch.processor.TextImageEmbeddingProcessor; import org.opensearch.neuralsearch.processor.combination.ScoreCombinationFactory; import org.opensearch.neuralsearch.processor.combination.ScoreCombiner; @@ -114,14 +115,16 @@ public Map getProcessors(Processor.Parameters paramet SparseEncodingProcessor.TYPE, new SparseEncodingProcessorFactory(clientAccessor, parameters.env), TextImageEmbeddingProcessor.TYPE, - new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()) + new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()), + DocumentChunkingProcessor.TYPE, + new DocumentChunkingProcessor.Factory() ); } @Override public Optional getQueryPhaseSearcher() { // we're using "is_disabled" flag as there are no proper implementation of FeatureFlags.isDisabled(). Both - // cases when flag is not set or it is "false" are interpretted in the same way. In such case core is reading + // cases when flag is not set, or it is "false" are interpreted in the same way. In such case core is reading // the actual value from settings. 
if (FeatureFlags.isEnabled(NEURAL_SEARCH_HYBRID_SEARCH_DISABLED.getKey())) { log.info( diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java new file mode 100644 index 000000000..4cfa52909 --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -0,0 +1,166 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor; + +import java.util.Map; +import java.util.Set; +import java.util.ArrayList; +import java.util.List; +import lombok.extern.log4j.Log4j2; + +import org.opensearch.client.Client; +import org.opensearch.client.node.NodeClient; +import org.opensearch.ingest.IngestDocument; +import org.opensearch.ingest.Processor; +import org.opensearch.ingest.AbstractProcessor; +import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; +import org.opensearch.neuralsearch.processor.chunker.IFieldChunker; + +import static org.opensearch.ingest.ConfigurationUtils.readMap; +import static org.opensearch.neuralsearch.processor.InferenceProcessor.FIELD_MAP_FIELD; + +@Log4j2 +public final class DocumentChunkingProcessor extends AbstractProcessor { + + public static final String TYPE = "chunking"; + public static final String OUTPUT_FIELD = "output_field"; + + private final Map fieldMap; + + private static NodeClient nodeClient; + + private final Set supportedChunkers = ChunkerFactory.getChunkers(); + + public DocumentChunkingProcessor(String tag, String description, Map fieldMap) { + super(tag, description); + validateDocumentChunkingFieldMap(fieldMap); + this.fieldMap = fieldMap; + } + + public static void initialize(Client nodeClient) { + DocumentChunkingProcessor.nodeClient = (NodeClient) nodeClient; + } + + public String getType() { + return TYPE; + } + + private void validateDocumentChunkingFieldMap(Map fieldMap) { + if (fieldMap == null || fieldMap.isEmpty()) { + throw new IllegalArgumentException("Unable to create the processor as field_map is null or empty"); + } + + for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { + String inputField = fieldMapEntry.getKey(); + Object parameters = fieldMapEntry.getValue(); + + if (parameters == null) { + throw new IllegalArgumentException("parameters for input field [" + inputField + "] is null, cannot process it."); + } + + if (!(parameters instanceof Map)) { + throw new IllegalArgumentException( + "parameters for input field [" + inputField + "] cannot be cast to [" + String.class.getName() + "]" + ); + } + + // Casting parameters to a map + Map parameterMap = (Map) parameters; + + // output field must be string + if (!(parameterMap.containsKey(OUTPUT_FIELD))) { + throw new IllegalArgumentException("parameters for output field [" + OUTPUT_FIELD + "] is null, cannot process it."); + } + + Object outputField = parameterMap.get(OUTPUT_FIELD); + + if (!(outputField instanceof String)) { + throw new IllegalArgumentException( + "parameters for output field [" + OUTPUT_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + ); + } + + // check non string parameters + int chunkingAlgorithmCount = 0; + Map chunkerParameters; + for (Map.Entry parameterEntry : parameterMap.entrySet()) { + if (!(parameterEntry.getKey() instanceof String)) { + throw new IllegalArgumentException("found parameter entry with non-string key"); + } + String parameterKey = (String) parameterEntry.getKey(); + if 
(supportedChunkers.contains(parameterKey)) { + chunkingAlgorithmCount += 1; + chunkerParameters = (Map) parameterEntry.getValue(); + IFieldChunker chunker = ChunkerFactory.create(parameterKey, nodeClient); + chunker.validateParameters(chunkerParameters); + } + } + + // should only define one algorithm + if (chunkingAlgorithmCount == 0) { + throw new IllegalArgumentException("chunking algorithm not defined for input field [" + inputField + "]"); + } + if (chunkingAlgorithmCount > 1) { + throw new IllegalArgumentException("multiple chunking algorithms defined for input field [" + inputField + "]"); + } + } + } + + @Override + public final IngestDocument execute(IngestDocument document) { + for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { + String inputField = fieldMapEntry.getKey(); + Object content = document.getFieldValue(inputField, Object.class); + + if (content == null) { + throw new IllegalArgumentException("input field in document [" + inputField + "] is null, cannot process it."); + } + + if (!(content instanceof String)) { + throw new IllegalArgumentException( + "input field [" + + inputField + + "] of type [" + + content.getClass().getName() + + "] cannot be cast to [" + + String.class.getName() + + "]" + ); + } + + Map parameters = (Map) fieldMapEntry.getValue(); + String outputField = (String) parameters.get(OUTPUT_FIELD); + List chunkedPassages = new ArrayList<>(); + + // parameter has been checked that there is only one algorithm + for (Map.Entry parameterEntry : parameters.entrySet()) { + String parameterKey = (String) parameterEntry.getKey(); + if (supportedChunkers.contains(parameterKey)) { + Map chunkerParameters = (Map) parameterEntry.getValue(); + IFieldChunker chunker = ChunkerFactory.create(parameterKey, nodeClient); + chunkedPassages = chunker.chunk((String) content, (Map) chunkerParameters); + } + } + document.setFieldValue(outputField, chunkedPassages); + } + return document; + } + + public static class Factory implements Processor.Factory { + public Factory() {} + + @Override + public DocumentChunkingProcessor create( + Map registry, + String processorTag, + String description, + Map config + ) throws Exception { + Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); + return new DocumentChunkingProcessor(processorTag, description, fieldMap); + } + + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java new file mode 100644 index 000000000..79bbe5211 --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -0,0 +1,38 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import org.opensearch.client.node.NodeClient; + +import java.util.Set; + +public class ChunkerFactory { + + public static final String FIXED_LENGTH_ALGORITHM = "fix_length"; + public static final String DELIMITER_ALGORITHM = "delimiter"; + + public static IFieldChunker create(String type, NodeClient nodeClient) { + switch (type) { + case FIXED_LENGTH_ALGORITHM: + return new FixedTokenLengthChunker(nodeClient); + case DELIMITER_ALGORITHM: + return new DelimiterChunker(); + default: + throw new IllegalArgumentException( + "chunker type [" + + type + + "] is not supported. 
Supported chunkers types are [" + + FIXED_LENGTH_ALGORITHM + + ", " + + DELIMITER_ALGORITHM + + "]" + ); + } + } + + public static Set getChunkers() { + return Set.of(FIXED_LENGTH_ALGORITHM, DELIMITER_ALGORITHM); + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java new file mode 100644 index 000000000..c9ef5e211 --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -0,0 +1,23 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import java.util.List; +import java.util.Map; + +public class DelimiterChunker implements IFieldChunker { + + public DelimiterChunker() {} + + @Override + public void validateParameters(Map parameters) { + throw new UnsupportedOperationException("delimiter chunker has not been implemented yet"); + } + + @Override + public List chunk(String content, Map parameters) { + throw new UnsupportedOperationException("delimiter chunker has not been implemented yet"); + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java new file mode 100644 index 000000000..31dfbfa4f --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -0,0 +1,104 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.opensearch.action.admin.indices.analyze.AnalyzeAction; +import org.opensearch.client.node.NodeClient; + +public class FixedTokenLengthChunker implements IFieldChunker { + + private static final String TOKEN_LIMIT = "token_limit"; + private static final String OVERLAP_RATE = "overlap_rate"; + + private static final String TOKENIZER = "tokenizer"; + + private final NodeClient nodeClient; + + public FixedTokenLengthChunker(NodeClient nodeClient) { + this.nodeClient = nodeClient; + } + + private List tokenize(String content, String tokenizer) { + AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); + analyzeRequest.text(content); + analyzeRequest.tokenizer(tokenizer); + AnalyzeAction.Response analyzeResponse = nodeClient.admin().indices().analyze(analyzeRequest).actionGet(); + List analyzeTokenList = analyzeResponse.getTokens(); + List tokenList = new ArrayList<>(); + for (AnalyzeAction.AnalyzeToken analyzeToken : analyzeTokenList) { + tokenList.add(analyzeToken.getTerm()); + } + return tokenList; + } + + @Override + public List chunk(String content, Map parameters) { + // parameters has been validated + int tokenLimit = 500; + double overlapRate = 0.2; + String tokenizer = "standard"; + + if (parameters.containsKey(TOKEN_LIMIT)) { + Number tokenLimitParam = (Number) parameters.get(TOKEN_LIMIT); + tokenLimit = (int) tokenLimitParam.intValue(); + } + if (parameters.containsKey(OVERLAP_RATE)) { + Number overlapRateParam = (Number) parameters.get(OVERLAP_RATE); + overlapRate = overlapRateParam.doubleValue(); + overlapRate = Math.min(1.0, overlapRate); + overlapRate = Math.max(0.0, overlapRate); + } + if (parameters.containsKey(TOKENIZER)) { + tokenizer = (String) parameters.get(TOKENIZER); + } + + List tokens = tokenize(content, 
tokenizer); + List passages = new ArrayList<>(); + + int startToken = 0; + int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); + // overlapTokenNumber must be smaller than the token limit + overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1); + + while (startToken < tokens.size()) { + if (startToken + tokenLimit >= tokens.size()) { + // break the loop when already cover the last token + String passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); + passages.add(passage); + break; + } else { + String passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); + passages.add(passage); + } + startToken += (tokenLimit - overlapTokenNumber); + } + return passages; + } + + @Override + public void validateParameters(Map parameters) { + if (parameters.containsKey(TOKEN_LIMIT) && !(parameters.get(TOKEN_LIMIT) instanceof Number)) { + throw new IllegalArgumentException( + "fixed length parameter [" + TOKEN_LIMIT + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + + if (parameters.containsKey(OVERLAP_RATE) && !(parameters.get(OVERLAP_RATE) instanceof Number)) { + throw new IllegalArgumentException( + "fixed length parameter [" + OVERLAP_RATE + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + + if (parameters.containsKey(TOKENIZER) && !(parameters.get(TOKENIZER) instanceof String)) { + throw new IllegalArgumentException( + "fixed length parameter [" + TOKENIZER + "] cannot be cast to [" + String.class.getName() + "]" + ); + } + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/IFieldChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/IFieldChunker.java new file mode 100644 index 000000000..6f031bf53 --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/IFieldChunker.java @@ -0,0 +1,14 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import java.util.Map; +import java.util.List; + +public interface IFieldChunker { + void validateParameters(Map parameters); + + List chunk(String content, Map parameters); +} From 3e2d365f94a160092540db32db69154040a0959e Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Sun, 18 Feb 2024 20:23:42 +0800 Subject: [PATCH 002/189] initialize node client for document chunking processor Signed-off-by: yuye-aws --- .../java/org/opensearch/neuralsearch/plugin/NeuralSearch.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index faf12fa12..b4f786fb0 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -91,6 +91,7 @@ public Collection createComponents( final Supplier repositoriesServiceSupplier ) { NeuralSearchClusterUtil.instance().initialize(clusterService); + DocumentChunkingProcessor.initialize(client); NeuralQueryBuilder.initialize(clientAccessor); NeuralSparseQueryBuilder.initialize(clientAccessor); normalizationProcessorWorkflow = new NormalizationProcessorWorkflow(new ScoreNormalizer(), new ScoreCombiner()); From 89584a94acab4dfb05c20b2fe6bc077e4dbfdab1 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 22 Feb 2024 16:46:37 +0800 Subject: [PATCH 003/189] initialize document chunking processor with analysis registry Signed-off-by: yuye-aws --- 
.../java/org/opensearch/neuralsearch/plugin/NeuralSearch.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index b4f786fb0..cf0cecb35 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -91,7 +91,6 @@ public Collection createComponents( final Supplier repositoriesServiceSupplier ) { NeuralSearchClusterUtil.instance().initialize(clusterService); - DocumentChunkingProcessor.initialize(client); NeuralQueryBuilder.initialize(clientAccessor); NeuralSparseQueryBuilder.initialize(clientAccessor); normalizationProcessorWorkflow = new NormalizationProcessorWorkflow(new ScoreNormalizer(), new ScoreCombiner()); @@ -118,7 +117,7 @@ public Map getProcessors(Processor.Parameters paramet TextImageEmbeddingProcessor.TYPE, new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()), DocumentChunkingProcessor.TYPE, - new DocumentChunkingProcessor.Factory() + new DocumentChunkingProcessor.Factory(parameters.analysisRegistry) ); } From 596fbf7c60107dc2c8badcaa2b25bdf9f58730a9 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 22 Feb 2024 16:47:35 +0800 Subject: [PATCH 004/189] chunker factory create with analysis registry Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 66 ++++++++++++------- .../processor/chunker/ChunkerFactory.java | 6 +- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 4cfa52909..8be5c8132 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -8,10 +8,8 @@ import java.util.Set; import java.util.ArrayList; import java.util.List; -import lombok.extern.log4j.Log4j2; -import org.opensearch.client.Client; -import org.opensearch.client.node.NodeClient; +import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; import org.opensearch.ingest.AbstractProcessor; @@ -21,7 +19,6 @@ import static org.opensearch.ingest.ConfigurationUtils.readMap; import static org.opensearch.neuralsearch.processor.InferenceProcessor.FIELD_MAP_FIELD; -@Log4j2 public final class DocumentChunkingProcessor extends AbstractProcessor { public static final String TYPE = "chunking"; @@ -29,24 +26,22 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private final Map fieldMap; - private static NodeClient nodeClient; - private final Set supportedChunkers = ChunkerFactory.getChunkers(); - public DocumentChunkingProcessor(String tag, String description, Map fieldMap) { + private final AnalysisRegistry analysisRegistry; + + public DocumentChunkingProcessor(String tag, String description, Map fieldMap, AnalysisRegistry analysisRegistry) { super(tag, description); validateDocumentChunkingFieldMap(fieldMap); this.fieldMap = fieldMap; - } - - public static void initialize(Client nodeClient) { - DocumentChunkingProcessor.nodeClient = (NodeClient) nodeClient; + this.analysisRegistry = analysisRegistry; } public String getType() { return TYPE; } + @SuppressWarnings("unchecked") 
private void validateDocumentChunkingFieldMap(Map fieldMap) { if (fieldMap == null || fieldMap.isEmpty()) { throw new IllegalArgumentException("Unable to create the processor as field_map is null or empty"); @@ -66,8 +61,7 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { ); } - // Casting parameters to a map - Map parameterMap = (Map) parameters; + Map parameterMap = (Map) parameters; // output field must be string if (!(parameterMap.containsKey(OUTPUT_FIELD))) { @@ -93,7 +87,7 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { if (supportedChunkers.contains(parameterKey)) { chunkingAlgorithmCount += 1; chunkerParameters = (Map) parameterEntry.getValue(); - IFieldChunker chunker = ChunkerFactory.create(parameterKey, nodeClient); + IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); chunker.validateParameters(chunkerParameters); } } @@ -109,7 +103,7 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { } @Override - public final IngestDocument execute(IngestDocument document) { + public IngestDocument execute(IngestDocument document) { for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { String inputField = fieldMapEntry.getKey(); Object content = document.getFieldValue(inputField, Object.class); @@ -118,7 +112,22 @@ public final IngestDocument execute(IngestDocument document) { throw new IllegalArgumentException("input field in document [" + inputField + "] is null, cannot process it."); } - if (!(content instanceof String)) { + if (content instanceof List) { + List contentList = (List) content; + for (Object contentElement : contentList) { + if (!(contentElement instanceof String)) { + throw new IllegalArgumentException( + "element in input field list [" + + inputField + + "] of type [" + + contentElement.getClass().getName() + + "] cannot be cast to [" + + String.class.getName() + + "]" + ); + } + } + } else if (!(content instanceof String)) { throw new IllegalArgumentException( "input field [" + inputField @@ -134,13 +143,21 @@ public final IngestDocument execute(IngestDocument document) { String outputField = (String) parameters.get(OUTPUT_FIELD); List chunkedPassages = new ArrayList<>(); - // parameter has been checked that there is only one algorithm + // we have validated that there is one chunking algorithm + // and that chunkerParameters is of type Map for (Map.Entry parameterEntry : parameters.entrySet()) { String parameterKey = (String) parameterEntry.getKey(); if (supportedChunkers.contains(parameterKey)) { - Map chunkerParameters = (Map) parameterEntry.getValue(); - IFieldChunker chunker = ChunkerFactory.create(parameterKey, nodeClient); - chunkedPassages = chunker.chunk((String) content, (Map) chunkerParameters); + @SuppressWarnings("unchecked") + Map chunkerParameters = (Map) parameterEntry.getValue(); + IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); + if (content instanceof String) { + chunkedPassages = chunker.chunk((String) content, chunkerParameters); + } else { + for (Object contentElement : (List) content) { + chunkedPassages.addAll(chunker.chunk((String) contentElement, chunkerParameters)); + } + } } } document.setFieldValue(outputField, chunkedPassages); @@ -149,7 +166,12 @@ public final IngestDocument execute(IngestDocument document) { } public static class Factory implements Processor.Factory { - public Factory() {} + + private final AnalysisRegistry analysisRegistry; + + public Factory(AnalysisRegistry analysisRegistry) { + this.analysisRegistry = analysisRegistry; 
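+            // analysisRegistry is supplied from Processor.Parameters.analysisRegistry
+            // when NeuralSearch.getProcessors registers this factory (see PATCH 003)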
+ } @Override public DocumentChunkingProcessor create( @@ -159,7 +181,7 @@ public DocumentChunkingProcessor create( Map config ) throws Exception { Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); - return new DocumentChunkingProcessor(processorTag, description, fieldMap); + return new DocumentChunkingProcessor(processorTag, description, fieldMap, analysisRegistry); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 79bbe5211..01f6d547f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -4,7 +4,7 @@ */ package org.opensearch.neuralsearch.processor.chunker; -import org.opensearch.client.node.NodeClient; +import org.opensearch.index.analysis.AnalysisRegistry; import java.util.Set; @@ -13,10 +13,10 @@ public class ChunkerFactory { public static final String FIXED_LENGTH_ALGORITHM = "fix_length"; public static final String DELIMITER_ALGORITHM = "delimiter"; - public static IFieldChunker create(String type, NodeClient nodeClient) { + public static IFieldChunker create(String type, AnalysisRegistry analysisRegistry) { switch (type) { case FIXED_LENGTH_ALGORITHM: - return new FixedTokenLengthChunker(nodeClient); + return new FixedTokenLengthChunker(analysisRegistry); case DELIMITER_ALGORITHM: return new DelimiterChunker(); default: From 636f907c2d907d0d7aa5668786308492d146ad9a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 22 Feb 2024 16:48:11 +0800 Subject: [PATCH 005/189] implement tokenizer in fixed token length algorithm with analysis registry Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 77 ++++++++++++------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 31dfbfa4f..c96c181a6 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -4,13 +4,19 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; +import lombok.extern.log4j.Log4j2; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; -import org.opensearch.client.node.NodeClient; +import org.opensearch.index.analysis.AnalysisRegistry; + +import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; + +@Log4j2 public class FixedTokenLengthChunker implements IFieldChunker { private static final String TOKEN_LIMIT = "token_limit"; @@ -18,24 +24,29 @@ public class FixedTokenLengthChunker implements IFieldChunker { private static final String TOKENIZER = "tokenizer"; - private final NodeClient nodeClient; + private final AnalysisRegistry analysisRegistry; - public FixedTokenLengthChunker(NodeClient nodeClient) { - this.nodeClient = nodeClient; + public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { + this.analysisRegistry = analysisRegistry; } private List tokenize(String content, String tokenizer) { AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); analyzeRequest.text(content); analyzeRequest.tokenizer(tokenizer); - 
AnalyzeAction.Response analyzeResponse = nodeClient.admin().indices().analyze(analyzeRequest).actionGet(); - List analyzeTokenList = analyzeResponse.getTokens(); - List tokenList = new ArrayList<>(); - for (AnalyzeAction.AnalyzeToken analyzeToken : analyzeTokenList) { - tokenList.add(analyzeToken.getTerm()); + try { + AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, 10000); + List analyzeTokenList = analyzeResponse.getTokens(); + List tokenList = new ArrayList<>(); + for (AnalyzeAction.AnalyzeToken analyzeToken : analyzeTokenList) { + tokenList.add(analyzeToken.getTerm()); + } + return tokenList; + + } catch (IOException e) { + throw new RuntimeException(e); } - return tokenList; - } + }; @Override public List chunk(String content, Map parameters) { @@ -45,14 +56,10 @@ public List chunk(String content, Map parameters) { String tokenizer = "standard"; if (parameters.containsKey(TOKEN_LIMIT)) { - Number tokenLimitParam = (Number) parameters.get(TOKEN_LIMIT); - tokenLimit = (int) tokenLimitParam.intValue(); + tokenLimit = ((Number) parameters.get(TOKEN_LIMIT)).intValue(); } if (parameters.containsKey(OVERLAP_RATE)) { - Number overlapRateParam = (Number) parameters.get(OVERLAP_RATE); - overlapRate = overlapRateParam.doubleValue(); - overlapRate = Math.min(1.0, overlapRate); - overlapRate = Math.max(0.0, overlapRate); + overlapRate = ((Number) parameters.get(OVERLAP_RATE)).doubleValue(); } if (parameters.containsKey(TOKENIZER)) { tokenizer = (String) parameters.get(TOKENIZER); @@ -61,6 +68,7 @@ public List chunk(String content, Map parameters) { List tokens = tokenize(content, tokenizer); List passages = new ArrayList<>(); + String passage; int startToken = 0; int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); // overlapTokenNumber must be smaller than the token limit @@ -69,30 +77,43 @@ public List chunk(String content, Map parameters) { while (startToken < tokens.size()) { if (startToken + tokenLimit >= tokens.size()) { // break the loop when already cover the last token - String passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); + passage = String.join(" ", tokens.subList(startToken, tokens.size())); passages.add(passage); break; } else { - String passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); + passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); passages.add(passage); } - startToken += (tokenLimit - overlapTokenNumber); + startToken += tokenLimit - overlapTokenNumber; } return passages; } @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(TOKEN_LIMIT) && !(parameters.get(TOKEN_LIMIT) instanceof Number)) { - throw new IllegalArgumentException( - "fixed length parameter [" + TOKEN_LIMIT + "] cannot be cast to [" + Number.class.getName() + "]" - ); + if (parameters.containsKey(TOKEN_LIMIT)) { + if (!(parameters.get(TOKEN_LIMIT) instanceof Number)) { + throw new IllegalArgumentException( + "fixed length parameter [" + TOKEN_LIMIT + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + if (((Number) parameters.get(TOKEN_LIMIT)).intValue() <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT + "] must be positive"); + } } - if (parameters.containsKey(OVERLAP_RATE) && !(parameters.get(OVERLAP_RATE) instanceof Number)) { - throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE + "] cannot be cast to [" + Number.class.getName() + "]" - ); + if 
(parameters.containsKey(OVERLAP_RATE)) { + if (!(parameters.get(OVERLAP_RATE) instanceof Number)) { + throw new IllegalArgumentException( + "fixed length parameter [" + OVERLAP_RATE + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + if (((Number) parameters.get(OVERLAP_RATE)).doubleValue() < 0.0 + || ((Number) parameters.get(OVERLAP_RATE)).doubleValue() >= 1.0) { + throw new IllegalArgumentException( + "fixed length parameter [" + OVERLAP_RATE + "] must be between 0 and 1, 1 is not included." + ); + } } if (parameters.containsKey(TOKENIZER) && !(parameters.get(TOKENIZER) instanceof String)) { From 2ffd6b0758698c79f2aa825b6a67133ab191fb2c Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 22 Feb 2024 22:57:31 +0800 Subject: [PATCH 006/189] add max token count parsing logic Signed-off-by: yuye-aws --- .../neuralsearch/plugin/NeuralSearch.java | 7 ++- .../processor/DocumentChunkingProcessor.java | 62 ++++++++++++++++++- .../chunker/FixedTokenLengthChunker.java | 18 ++++-- 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index cf0cecb35..dc5b6e8f2 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -117,7 +117,12 @@ public Map getProcessors(Processor.Parameters paramet TextImageEmbeddingProcessor.TYPE, new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()), DocumentChunkingProcessor.TYPE, - new DocumentChunkingProcessor.Factory(parameters.analysisRegistry) + new DocumentChunkingProcessor.Factory( + parameters.env.settings(), + parameters.ingestService.getClusterService(), + parameters.indicesService, + parameters.analysisRegistry + ) ); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 8be5c8132..99f09e3ff 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -8,13 +8,22 @@ import java.util.Set; import java.util.ArrayList; import java.util.List; +import java.util.Objects; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.index.Index; +import org.opensearch.index.IndexService; +import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisRegistry; +import org.opensearch.indices.IndicesService; import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; import org.opensearch.ingest.AbstractProcessor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; +import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.neuralsearch.processor.chunker.IFieldChunker; +import org.opensearch.index.mapper.IndexFieldMapper; import static org.opensearch.ingest.ConfigurationUtils.readMap; import static org.opensearch.neuralsearch.processor.InferenceProcessor.FIELD_MAP_FIELD; @@ -28,12 +37,29 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private final Set supportedChunkers = ChunkerFactory.getChunkers(); + private final Settings settings; + + private final ClusterService clusterService; + + private 
final IndicesService indicesService; + private final AnalysisRegistry analysisRegistry; - public DocumentChunkingProcessor(String tag, String description, Map fieldMap, AnalysisRegistry analysisRegistry) { + public DocumentChunkingProcessor( + String tag, + String description, + Map fieldMap, + Settings settings, + ClusterService clusterService, + IndicesService indicesService, + AnalysisRegistry analysisRegistry + ) { super(tag, description); validateDocumentChunkingFieldMap(fieldMap); this.fieldMap = fieldMap; + this.settings = settings; + this.clusterService = clusterService; + this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; } @@ -139,6 +165,8 @@ public IngestDocument execute(IngestDocument document) { ); } + + Map parameters = (Map) fieldMapEntry.getValue(); String outputField = (String) parameters.get(OUTPUT_FIELD); List chunkedPassages = new ArrayList<>(); @@ -150,6 +178,17 @@ public IngestDocument execute(IngestDocument document) { if (supportedChunkers.contains(parameterKey)) { @SuppressWarnings("unchecked") Map chunkerParameters = (Map) parameterEntry.getValue(); + if (Objects.equals(parameterKey, ChunkerFactory.FIXED_LENGTH_ALGORITHM)) { + // add maxTokenCount setting from index metadata to chunker parameters + Map sourceAndMetadataMap = document.getSourceAndMetadata(); + String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); + Index index = clusterService.state().metadata().index(indexName).getIndex(); + IndexService indexService = indicesService.indexServiceSafe(index); + final int maxTokenCount = indexService == null + ? IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings) + : indexService.getIndexSettings().getMaxTokenCount(); + chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT, maxTokenCount); + } IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); if (content instanceof String) { chunkedPassages = chunker.chunk((String) content, chunkerParameters); @@ -167,9 +206,18 @@ public IngestDocument execute(IngestDocument document) { public static class Factory implements Processor.Factory { + private final Settings settings; + + private final ClusterService clusterService; + + private final IndicesService indicesService; + private final AnalysisRegistry analysisRegistry; - public Factory(AnalysisRegistry analysisRegistry) { + public Factory(Settings settings, ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry) { + this.settings = settings; + this.clusterService = clusterService; + this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; } @@ -181,7 +229,15 @@ public DocumentChunkingProcessor create( Map config ) throws Exception { Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); - return new DocumentChunkingProcessor(processorTag, description, fieldMap, analysisRegistry); + return new DocumentChunkingProcessor( + processorTag, + description, + fieldMap, + settings, + clusterService, + indicesService, + analysisRegistry + ); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index c96c181a6..95727893a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -19,10 +19,12 @@ @Log4j2 public class FixedTokenLengthChunker 
implements IFieldChunker { - private static final String TOKEN_LIMIT = "token_limit"; - private static final String OVERLAP_RATE = "overlap_rate"; + public static final String TOKEN_LIMIT = "token_limit"; + public static final String OVERLAP_RATE = "overlap_rate"; - private static final String TOKENIZER = "tokenizer"; + public static final String MAX_TOKEN_COUNT = "max_token_count"; + + public static final String TOKENIZER = "tokenizer"; private final AnalysisRegistry analysisRegistry; @@ -30,12 +32,12 @@ public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { this.analysisRegistry = analysisRegistry; } - private List tokenize(String content, String tokenizer) { + private List tokenize(String content, String tokenizer, int maxTokenCount) { AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); analyzeRequest.text(content); analyzeRequest.tokenizer(tokenizer); try { - AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, 10000); + AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); List analyzeTokenList = analyzeResponse.getTokens(); List tokenList = new ArrayList<>(); for (AnalyzeAction.AnalyzeToken analyzeToken : analyzeTokenList) { @@ -53,6 +55,7 @@ public List chunk(String content, Map parameters) { // parameters has been validated int tokenLimit = 500; double overlapRate = 0.2; + int maxTokenCount = 10000; String tokenizer = "standard"; if (parameters.containsKey(TOKEN_LIMIT)) { @@ -61,11 +64,14 @@ public List chunk(String content, Map parameters) { if (parameters.containsKey(OVERLAP_RATE)) { overlapRate = ((Number) parameters.get(OVERLAP_RATE)).doubleValue(); } + if (parameters.containsKey(MAX_TOKEN_COUNT)) { + maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT)).intValue(); + } if (parameters.containsKey(TOKENIZER)) { tokenizer = (String) parameters.get(TOKENIZER); } - List tokens = tokenize(content, tokenizer); + List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); String passage; From 2195353c3ceb472e8a1333ba89e9ead4e18ca2b6 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 22 Feb 2024 23:05:20 +0800 Subject: [PATCH 007/189] bug fix for non-existing index Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 99f09e3ff..061f28289 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -10,9 +10,9 @@ import java.util.List; import java.util.Objects; +import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; -import org.opensearch.core.index.Index; import org.opensearch.index.IndexService; import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisRegistry; @@ -165,8 +165,6 @@ public IngestDocument execute(IngestDocument document) { ); } - - Map parameters = (Map) fieldMapEntry.getValue(); String outputField = (String) parameters.get(OUTPUT_FIELD); List chunkedPassages = new ArrayList<>(); @@ -179,14 +177,16 @@ public IngestDocument execute(IngestDocument document) { @SuppressWarnings("unchecked") 
Map chunkerParameters = (Map) parameterEntry.getValue(); if (Objects.equals(parameterKey, ChunkerFactory.FIXED_LENGTH_ALGORITHM)) { - // add maxTokenCount setting from index metadata to chunker parameters + // add maxTokenCount to chunker parameters Map sourceAndMetadataMap = document.getSourceAndMetadata(); + int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings); String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); - Index index = clusterService.state().metadata().index(indexName).getIndex(); - IndexService indexService = indicesService.indexServiceSafe(index); - final int maxTokenCount = indexService == null - ? IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings) - : indexService.getIndexSettings().getMaxTokenCount(); + IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); + if (indexMetadata != null) { + // if the index exists, read maxTokenCount from the index setting + IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); + maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); + } chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT, maxTokenCount); } IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); @@ -211,7 +211,7 @@ public static class Factory implements Processor.Factory { private final ClusterService clusterService; private final IndicesService indicesService; - + private final AnalysisRegistry analysisRegistry; public Factory(Settings settings, ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry) { From bdd418e7e68242f4806c0700a7078083692c86fe Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 22 Feb 2024 23:16:00 +0800 Subject: [PATCH 008/189] change error log Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 061f28289..8be4db6eb 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -143,7 +143,7 @@ public IngestDocument execute(IngestDocument document) { for (Object contentElement : contentList) { if (!(contentElement instanceof String)) { throw new IllegalArgumentException( - "element in input field list [" + "some element in input field list [" + inputField + "] of type [" + contentElement.getClass().getName() From 458420b6425ee183466149b894c9e2de155604f2 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 11:31:11 +0800 Subject: [PATCH 009/189] implement evenly chunk Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 108 ++++++++++++++---- 1 file changed, 86 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 95727893a..b0aef474c 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -19,13 +19,19 @@ @Log4j2 public class FixedTokenLengthChunker implements IFieldChunker { + // parameters + public static final String TOKEN_LIMIT 
= "token_limit"; public static final String OVERLAP_RATE = "overlap_rate"; - public static final String MAX_TOKEN_COUNT = "max_token_count"; - public static final String TOKENIZER = "tokenizer"; + // default values for each parameter + private static final int DEFAULT_TOKEN_LIMIT = 500; + private static final double DEFAULT_OVERLAP_RATE = 0.2; + private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; + private static final String DEFAULT_TOKENIZER = "standard"; + private final AnalysisRegistry analysisRegistry; public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { @@ -48,15 +54,64 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun } catch (IOException e) { throw new RuntimeException(e); } - }; + } + + private static int getOverlapTokenNumber(int tokenCount, double overlapRate) { + return Math.min(tokenCount - 1, (int) Math.floor(tokenCount * overlapRate)); + } + + private static int getDocumentTokenCount(int tokensPerPassage, int passageCount, double overlapRate) { + // return the token count of the document, if all (passageCount) passages have (tokensPerPassage) tokens + int overlapTokenNumber = getOverlapTokenNumber(tokensPerPassage, overlapRate); + return tokensPerPassage + (passageCount - 1) * (tokensPerPassage - overlapTokenNumber); + } + + private static int getPassageCount(int contentLength, int tokenLimit, double overlapRate) { + /* + passageCount means the number of chunked passages, which should be the minimum integer + so that getDocumentTokenCount(tokenLimit, passageCount, overlapRate) >= contentLength + */ + int overlapTokenNumber = getOverlapTokenNumber(tokenLimit, overlapRate); + return 1 + (int) Math.ceil((contentLength - tokenLimit) / (double) (tokenLimit - overlapTokenNumber)); + } + + private static int getTokensPerPassage(int contentLength, int passageCount, int tokeLimit, double overlapRate) { + /* + To evenly chunk the documents, the token length difference among passages is at most 1. + The output passages contain long passages and short passages (with 1 token less than long passages). + tokensPerPassage means the number of tokens for longest passages. 
+ tokensPerPassage should be the minimum integer so that + getDocumentTokenCount(tokensPerPassage, passageCount, overlapRate) >= contentLength + + As this problem do not have a closed form solution, we use binary search to find the tokensPerPassages + */ + int left = 1; + int right = tokeLimit; + int tokensPerPassage = right; + int mid; + while (left <= right) { + mid = (left + right) >> 1; + if (getDocumentTokenCount(mid, passageCount, overlapRate) < contentLength) { + // mid is too small + left = mid + 1; + } else if (mid > left && getDocumentTokenCount(mid - 1, passageCount, overlapRate) >= contentLength) { + // mid - 1 suffices + right = mid + 1; + } else { + tokensPerPassage = mid; + break; + } + } + return tokensPerPassage; + } @Override public List chunk(String content, Map parameters) { - // parameters has been validated - int tokenLimit = 500; - double overlapRate = 0.2; - int maxTokenCount = 10000; - String tokenizer = "standard"; + // assume that parameters has been validated + int tokenLimit = DEFAULT_TOKEN_LIMIT; + double overlapRate = DEFAULT_OVERLAP_RATE; + int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; + String tokenizer = DEFAULT_TOKENIZER; if (parameters.containsKey(TOKEN_LIMIT)) { tokenLimit = ((Number) parameters.get(TOKEN_LIMIT)).intValue(); @@ -72,26 +127,35 @@ public List chunk(String content, Map parameters) { } List tokens = tokenize(content, tokenizer, maxTokenCount); - List passages = new ArrayList<>(); + int tokenLength = tokens.size(); + + if (tokenLength == 0) { + return new ArrayList<>(); + } + if (tokenLength <= tokenLimit) { + return List.of(content); + } + + int passageCount = getPassageCount(tokenLength, tokenLimit, overlapRate); + int tokensPerPassage = getTokensPerPassage(tokenLength, passageCount, tokenLimit, overlapRate); + int overlapTokenNumber = getOverlapTokenNumber(tokensPerPassage, overlapRate); + int exceedingTokenCount = getDocumentTokenCount(tokensPerPassage, passageCount, overlapRate) - tokenLength; String passage; int startToken = 0; - int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); - // overlapTokenNumber must be smaller than the token limit - overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1); - - while (startToken < tokens.size()) { - if (startToken + tokenLimit >= tokens.size()) { - // break the loop when already cover the last token - passage = String.join(" ", tokens.subList(startToken, tokens.size())); - passages.add(passage); - break; + List passages = new ArrayList<>(); + + for (int i = 0; i < passageCount; i++) { + if (i + exceedingTokenCount < passageCount) { + passage = String.join(" ", tokens.subList(startToken, startToken + tokensPerPassage)); } else { - passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); - passages.add(passage); + // (exceedingTokenCount) passages contain 1 less token + passage = String.join(" ", tokens.subList(startToken, startToken + tokensPerPassage - 1)); } - startToken += tokenLimit - overlapTokenNumber; + passages.add(passage); + startToken += tokensPerPassage - overlapTokenNumber; } + return passages; } From 02420d7c1e5c1662b74476ad656f259cb49a315c Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 12:19:42 +0800 Subject: [PATCH 010/189] unit tests for chunker factory Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 2 +- .../processor/chunker/ChunkerFactory.java | 3 +- .../chunker/ChunkerFactoryTests.java | 36 +++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 
src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 8be4db6eb..83e61a63e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -35,7 +35,7 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private final Map fieldMap; - private final Set supportedChunkers = ChunkerFactory.getChunkers(); + private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); private final Settings settings; diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 01f6d547f..f1eb3b68b 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -32,7 +32,8 @@ public static IFieldChunker create(String type, AnalysisRegistry analysisRegistr } } - public static Set getChunkers() { + public static Set getAllChunkers() { return Set.of(FIXED_LENGTH_ALGORITHM, DELIMITER_ALGORITHM); } + } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java new file mode 100644 index 000000000..360a4e99d --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -0,0 +1,36 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import org.opensearch.index.analysis.AnalysisRegistry; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.Set; + +public class ChunkerFactoryTests extends OpenSearchTestCase { + + private AnalysisRegistry registry; + + public void testGetAllChunkers() { + Set expected = Set.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, ChunkerFactory.DELIMITER_ALGORITHM); + assertEquals(expected, ChunkerFactory.getAllChunkers()); + } + + public void testCreate_FixedTokenLength() { + IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, registry); + assertNotNull(chunker); + assertTrue(chunker instanceof FixedTokenLengthChunker); + } + + public void testCreate_Delimiter() { + IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, registry); + assertNotNull(chunker); + assertTrue(chunker instanceof DelimiterChunker); + } + + public void testCreate_Invalid() { + assertThrows(IllegalArgumentException.class, () -> ChunkerFactory.create("Invalid Chunker Type", registry)); + } +} From f8f60a103a84b66d136befccb777ed7edbbfcb45 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 12:28:51 +0800 Subject: [PATCH 011/189] unit tests for chunker factory Signed-off-by: yuye-aws --- .../processor/chunker/FixedTokenLengthChunkerTests.java | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java 
b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java new file mode 100644 index 000000000..3b544991d --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -0,0 +1,2 @@ +package org.opensearch.neuralsearch.processor.chunker;public class FixedTokenLengthChunkerTests { +} From ff0587ccc70d90b0b046bd1da7ca5212ae9c897b Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 12:35:45 +0800 Subject: [PATCH 012/189] add error message for chunker factory tests Signed-off-by: yuye-aws --- .../processor/chunker/ChunkerFactoryTests.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 360a4e99d..8ecbae67d 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -4,6 +4,7 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import org.mockito.Mock; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.test.OpenSearchTestCase; @@ -11,6 +12,7 @@ public class ChunkerFactoryTests extends OpenSearchTestCase { + @Mock private AnalysisRegistry registry; public void testGetAllChunkers() { @@ -31,6 +33,13 @@ public void testCreate_Delimiter() { } public void testCreate_Invalid() { - assertThrows(IllegalArgumentException.class, () -> ChunkerFactory.create("Invalid Chunker Type", registry)); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> ChunkerFactory.create("Invalid Chunker Type", registry) + ); + assertEquals( + "chunker type [Invalid Chunker Type] is not supported. 
Supported chunkers types are [fix_length, delimiter]", + illegalArgumentException.getMessage() + ); } } From afc3189859979966c90d72cc957b30a29638afd4 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 13:02:52 +0800 Subject: [PATCH 013/189] resolve comments Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 83e61a63e..bf7a54656 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -83,7 +83,7 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { if (!(parameters instanceof Map)) { throw new IllegalArgumentException( - "parameters for input field [" + inputField + "] cannot be cast to [" + String.class.getName() + "]" + "parameters for input field [" + inputField + "] cannot be cast to [" + Map.class.getName() + "]" ); } @@ -119,11 +119,8 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { } // should only define one algorithm - if (chunkingAlgorithmCount == 0) { - throw new IllegalArgumentException("chunking algorithm not defined for input field [" + inputField + "]"); - } - if (chunkingAlgorithmCount > 1) { - throw new IllegalArgumentException("multiple chunking algorithms defined for input field [" + inputField + "]"); + if (chunkingAlgorithmCount != 1) { + throw new IllegalArgumentException("input field [" + inputField + "] should has and only has 1 chunking algorithm"); } } } @@ -165,14 +162,15 @@ public IngestDocument execute(IngestDocument document) { ); } - Map parameters = (Map) fieldMapEntry.getValue(); + @SuppressWarnings("unchecked") + Map parameters = (Map) fieldMapEntry.getValue(); String outputField = (String) parameters.get(OUTPUT_FIELD); List chunkedPassages = new ArrayList<>(); // we have validated that there is one chunking algorithm // and that chunkerParameters is of type Map - for (Map.Entry parameterEntry : parameters.entrySet()) { - String parameterKey = (String) parameterEntry.getKey(); + for (Map.Entry parameterEntry : parameters.entrySet()) { + String parameterKey = parameterEntry.getKey(); if (supportedChunkers.contains(parameterKey)) { @SuppressWarnings("unchecked") Map chunkerParameters = (Map) parameterEntry.getValue(); From 159e426812f75651483743ca3c8ce1416e52911e Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 14:31:10 +0800 Subject: [PATCH 014/189] Revert "implement evenly chunk" This reverts commit 93dd2f454cdb89fd0c4bc53142b2c06f94a93202. 
Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 108 ++++-------------- 1 file changed, 22 insertions(+), 86 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index b0aef474c..95727893a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -19,18 +19,12 @@ @Log4j2 public class FixedTokenLengthChunker implements IFieldChunker { - // parameters - public static final String TOKEN_LIMIT = "token_limit"; public static final String OVERLAP_RATE = "overlap_rate"; + public static final String MAX_TOKEN_COUNT = "max_token_count"; - public static final String TOKENIZER = "tokenizer"; - // default values for each parameter - private static final int DEFAULT_TOKEN_LIMIT = 500; - private static final double DEFAULT_OVERLAP_RATE = 0.2; - private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; - private static final String DEFAULT_TOKENIZER = "standard"; + public static final String TOKENIZER = "tokenizer"; private final AnalysisRegistry analysisRegistry; @@ -54,64 +48,15 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun } catch (IOException e) { throw new RuntimeException(e); } - } - - private static int getOverlapTokenNumber(int tokenCount, double overlapRate) { - return Math.min(tokenCount - 1, (int) Math.floor(tokenCount * overlapRate)); - } - - private static int getDocumentTokenCount(int tokensPerPassage, int passageCount, double overlapRate) { - // return the token count of the document, if all (passageCount) passages have (tokensPerPassage) tokens - int overlapTokenNumber = getOverlapTokenNumber(tokensPerPassage, overlapRate); - return tokensPerPassage + (passageCount - 1) * (tokensPerPassage - overlapTokenNumber); - } - - private static int getPassageCount(int contentLength, int tokenLimit, double overlapRate) { - /* - passageCount means the number of chunked passages, which should be the minimum integer - so that getDocumentTokenCount(tokenLimit, passageCount, overlapRate) >= contentLength - */ - int overlapTokenNumber = getOverlapTokenNumber(tokenLimit, overlapRate); - return 1 + (int) Math.ceil((contentLength - tokenLimit) / (double) (tokenLimit - overlapTokenNumber)); - } - - private static int getTokensPerPassage(int contentLength, int passageCount, int tokeLimit, double overlapRate) { - /* - To evenly chunk the documents, the token length difference among passages is at most 1. - The output passages contain long passages and short passages (with 1 token less than long passages). - tokensPerPassage means the number of tokens for longest passages. 
- tokensPerPassage should be the minimum integer so that - getDocumentTokenCount(tokensPerPassage, passageCount, overlapRate) >= contentLength - - As this problem do not have a closed form solution, we use binary search to find the tokensPerPassages - */ - int left = 1; - int right = tokeLimit; - int tokensPerPassage = right; - int mid; - while (left <= right) { - mid = (left + right) >> 1; - if (getDocumentTokenCount(mid, passageCount, overlapRate) < contentLength) { - // mid is too small - left = mid + 1; - } else if (mid > left && getDocumentTokenCount(mid - 1, passageCount, overlapRate) >= contentLength) { - // mid - 1 suffices - right = mid + 1; - } else { - tokensPerPassage = mid; - break; - } - } - return tokensPerPassage; - } + }; @Override public List chunk(String content, Map parameters) { - // assume that parameters has been validated - int tokenLimit = DEFAULT_TOKEN_LIMIT; - double overlapRate = DEFAULT_OVERLAP_RATE; - int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; - String tokenizer = DEFAULT_TOKENIZER; + // parameters has been validated + int tokenLimit = 500; + double overlapRate = 0.2; + int maxTokenCount = 10000; + String tokenizer = "standard"; if (parameters.containsKey(TOKEN_LIMIT)) { tokenLimit = ((Number) parameters.get(TOKEN_LIMIT)).intValue(); @@ -127,35 +72,26 @@ public List chunk(String content, Map parameters) { } List tokens = tokenize(content, tokenizer, maxTokenCount); - int tokenLength = tokens.size(); - - if (tokenLength == 0) { - return new ArrayList<>(); - } - if (tokenLength <= tokenLimit) { - return List.of(content); - } - - int passageCount = getPassageCount(tokenLength, tokenLimit, overlapRate); - int tokensPerPassage = getTokensPerPassage(tokenLength, passageCount, tokenLimit, overlapRate); - int overlapTokenNumber = getOverlapTokenNumber(tokensPerPassage, overlapRate); - int exceedingTokenCount = getDocumentTokenCount(tokensPerPassage, passageCount, overlapRate) - tokenLength; + List passages = new ArrayList<>(); String passage; int startToken = 0; - List passages = new ArrayList<>(); - - for (int i = 0; i < passageCount; i++) { - if (i + exceedingTokenCount < passageCount) { - passage = String.join(" ", tokens.subList(startToken, startToken + tokensPerPassage)); + int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); + // overlapTokenNumber must be smaller than the token limit + overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1); + + while (startToken < tokens.size()) { + if (startToken + tokenLimit >= tokens.size()) { + // break the loop once the last token is covered + passage = String.join(" ", tokens.subList(startToken, tokens.size())); + passages.add(passage); + break; } else { - // (exceedingTokenCount) passages contain 1 less token - passage = String.join(" ", tokens.subList(startToken, startToken + tokensPerPassage - 1)); + passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); + passages.add(passage); } - passages.add(passage); - startToken += tokensPerPassage - overlapTokenNumber; + startToken += tokenLimit - overlapTokenNumber; } - return passages; } From 240595244781c9be53c14fc13543b1bbf7f2b03f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 14:32:16 +0800 Subject: [PATCH 015/189] add default value logic back Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 95727893a..9045f4be6 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -26,6 +26,12 @@ public class FixedTokenLengthChunker implements IFieldChunker { public static final String TOKENIZER = "tokenizer"; + // default values for each parameter + private static final int DEFAULT_TOKEN_LIMIT = 500; + private static final double DEFAULT_OVERLAP_RATE = 0.2; + private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; + private static final String DEFAULT_TOKENIZER = "standard"; + private final AnalysisRegistry analysisRegistry; public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { @@ -52,11 +58,11 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun @Override public List chunk(String content, Map parameters) { - // parameters has been validated - int tokenLimit = 500; - double overlapRate = 0.2; - int maxTokenCount = 10000; - String tokenizer = "standard"; + // assume that parameters has been validated + int tokenLimit = DEFAULT_TOKEN_LIMIT; + double overlapRate = DEFAULT_OVERLAP_RATE; + int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; + String tokenizer = DEFAULT_TOKENIZER; if (parameters.containsKey(TOKEN_LIMIT)) { tokenLimit = ((Number) parameters.get(TOKEN_LIMIT)).intValue(); From b9302228f5585f58bbdf9244d5943c6d2c247f68 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 15:53:53 +0800 Subject: [PATCH 016/189] implement unit test for fixed token length chunker Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunkerTests.java | 142 +++++++++++++++++- 1 file changed, 141 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 3b544991d..c45de1159 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -1,2 +1,142 @@ -package org.opensearch.neuralsearch.processor.chunker;public class FixedTokenLengthChunkerTests { +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import lombok.SneakyThrows; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.junit.Before; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.env.TestEnvironment; +import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.index.analysis.AnalysisRegistry; +import org.opensearch.indices.analysis.AnalysisModule; +import org.opensearch.plugins.AnalysisPlugin; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.singletonList; +import static java.util.Collections.singletonMap; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER; + +public class 
FixedTokenLengthChunkerTests extends OpenSearchTestCase { + + private FixedTokenLengthChunker FixedTokenLengthChunker; + + @Before + @SneakyThrows + public void setup() { + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); + Environment environment = TestEnvironment.newEnvironment(settings); + + AnalysisPlugin plugin = new AnalysisPlugin() { + + @Override + public Map> getTokenizers() { + return singletonMap( + "keyword", + (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( + name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false) + ) + ); + } + }; + AnalysisRegistry analysisRegistry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); + FixedTokenLengthChunker = new FixedTokenLengthChunker(analysisRegistry); + } + + public void testValidateParameters_whenNoParams_thenSuccessful() { + Map parameters = new HashMap<>(); + ; + FixedTokenLengthChunker.validateParameters(parameters); + } + + public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT, "invalid token limit"); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> FixedTokenLengthChunker.validateParameters(parameters) + ); + assertEquals("fixed length parameter [token_limit] cannot be cast to [java.lang.Number]", illegalArgumentException.getMessage()); + } + + public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT, -1); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> FixedTokenLengthChunker.validateParameters(parameters) + ); + assertEquals("fixed length parameter [token_limit] must be positive", illegalArgumentException.getMessage()); + } + + public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(OVERLAP_RATE, "invalid overlap rate"); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> FixedTokenLengthChunker.validateParameters(parameters) + ); + assertEquals("fixed length parameter [overlap_rate] cannot be cast to [java.lang.Number]", illegalArgumentException.getMessage()); + } + + public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(OVERLAP_RATE, 1.0); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> FixedTokenLengthChunker.validateParameters(parameters) + ); + assertEquals( + "fixed length parameter [overlap_rate] must be between 0 and 1, 1 is not included.", + illegalArgumentException.getMessage() + ); + } + + public void testValidateParameters_whenIllegalTokenizerType_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(TOKENIZER, 111); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> FixedTokenLengthChunker.validateParameters(parameters) + ); + assertEquals("fixed length parameter [tokenizer] cannot be cast to [java.lang.String]", illegalArgumentException.getMessage()); + } + + public void testChunk_withTokenLimit_10() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT, 10); + String content = + "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + List passages = FixedTokenLengthChunker.chunk(content, parameters); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } + + public void testChunk_withTokenLimit_20() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT, 20); + String content = + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + List passages = FixedTokenLengthChunker.chunk(content, parameters); + List expectedPassages = new ArrayList<>(); + expectedPassages.add( + "This is an example document to be chunked The document contains a single paragraph two sentences and 24 tokens by" + ); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } } From ecb8297e225fe3733f14da873aa32f59c06b2623 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 16:02:32 +0800 Subject: [PATCH 017/189] add test cases in unit test for fixed token length chunker Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunkerTests.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index c45de1159..bea3a68bf 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -139,4 +139,19 @@ public void testChunk_withTokenLimit_20() { expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } + + public void testChunk_withOverlapRate_half() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT, 10); + parameters.put(OVERLAP_RATE, 0.5); + String content = + "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + List passages = FixedTokenLengthChunker.chunk(content, parameters); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("to be chunked The document contains a single paragraph two"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } } From d6d31faee281ef978ac71c14b4f2ba9080165acd Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 16:07:02 +0800 Subject: [PATCH 018/189] support map type as an input Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index bf7a54656..c6767742c 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -150,6 +150,21 @@ public IngestDocument execute(IngestDocument document) { ); } } + } else if (content instanceof Map) { + Map contentMap = (Map) content; + for (Object contentElement : contentMap.values()) { + if (!(contentElement instanceof String)) { + throw new IllegalArgumentException( + "some element in input field map [" + + inputField + + "] of type [" + + contentElement.getClass().getName() + + "] cannot be cast to [" + + String.class.getName() + + "]" + ); + } + } } else if (!(content instanceof String)) { throw new IllegalArgumentException( "input field [" @@ -190,10 +205,15 @@ public IngestDocument execute(IngestDocument document) { IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); if (content instanceof String) { chunkedPassages = chunker.chunk((String) content, chunkerParameters); - } else { + } else if (content instanceof List) { for (Object contentElement : (List) content) { chunkedPassages.addAll(chunker.chunk((String) contentElement, chunkerParameters)); } + } else { + // content is map type + for (Object contentElement : ((Map) content).values()) { + chunkedPassages.addAll(chunker.chunk((String) contentElement, chunkerParameters)); + } } } } From fafae93e9ce1bfa76344f4d72956ce428933c9a6 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 16:55:40 +0800 Subject: [PATCH 019/189] support map type as an input Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 124 ++++++++++-------- 1 file changed, 69 insertions(+), 55 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index c6767742c..95e3a0084 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -4,6 +4,7 @@ */ package org.opensearch.neuralsearch.processor; +import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.ArrayList; @@ -125,6 +126,72 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { } } + private void validateContent(Object 
content, String inputField) { + // content can be a map, a string or a list of strings + if (content instanceof Map) { + @SuppressWarnings("unchecked") + Map contentMap = (Map) content; + for (Map.Entry contentEntry : contentMap.entrySet()) { + String contentKey = contentEntry.getKey(); + Object contentValue = contentEntry.getValue(); + // the map value can also be a map, list or string + validateContent(contentValue, inputField + "." + contentKey); + } + } + if (content instanceof List) { + List contentList = (List) content; + for (Object contentElement : contentList) { + if (!(contentElement instanceof String)) { + throw new IllegalArgumentException( + "some element in input field list [" + + inputField + + "] of type [" + + contentElement.getClass().getName() + + "] cannot be cast to [" + + String.class.getName() + + "]" + ); + } + } + } + if (!(content instanceof String)) { + throw new IllegalArgumentException( + "input field [" + + inputField + + "] of type [" + + content.getClass().getName() + + "] cannot be cast to [" + + String.class.getName() + + "]" + ); + } + } + + private Object chunk(IFieldChunker chunker, Object content, Map chunkerParameters) { + // assume that content is either a map, list or string + if (content instanceof Map) { + Map chunkedPassageMap = new HashMap<>(); + Map contentMap = (Map) content; + for (Map.Entry contentEntry : contentMap.entrySet()) { + String contentKey = contentEntry.getKey(); + Object contentValue = contentEntry.getValue(); + // contentValue can also be a map, list or string + chunkedPassageMap.put(contentKey, chunk(chunker, contentValue, chunkerParameters)); + } + return chunkedPassageMap; + } else if (content instanceof List) { + List chunkedPassageList = new ArrayList<>(); + List contentList = (List) content; + for (Object contentElement : contentList) { + chunkedPassageList.addAll(chunker.chunk((String) contentElement, chunkerParameters)); + } + return chunkedPassageList; + } else { + return chunker.chunk((String) content, chunkerParameters); + } + } + + @Override public IngestDocument execute(IngestDocument document) { for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { @@ -135,52 +202,11 @@ public IngestDocument execute(IngestDocument document) { throw new IllegalArgumentException("input field in document [" + inputField + "] is null, cannot process it."); } - if (content instanceof List) { - List contentList = (List) content; - for (Object contentElement : contentList) { - if (!(contentElement instanceof String)) { - throw new IllegalArgumentException( - "some element in input field list [" - + inputField - + "] of type [" - + contentElement.getClass().getName() - + "] cannot be cast to [" - + String.class.getName() - + "]" - ); - } - } - } else if (content instanceof Map) { - Map contentMap = (Map) content; - for (Object contentElement : contentMap.values()) { - if (!(contentElement instanceof String)) { - throw new IllegalArgumentException( - "some element in input field map [" - + inputField - + "] of type [" - + contentElement.getClass().getName() - + "] cannot be cast to [" - + String.class.getName() - + "]" - ); - } - } - } else if (!(content instanceof String)) { - throw new IllegalArgumentException( - "input field [" - + inputField - + "] of type [" - + content.getClass().getName() - + "] cannot be cast to [" - + String.class.getName() - + "]" - ); - } + validateContent(content, inputField); @SuppressWarnings("unchecked") Map parameters = (Map) fieldMapEntry.getValue(); String outputField = (String) parameters.get(OUTPUT_FIELD);
- List chunkedPassages = new ArrayList<>(); // we have validated that there is one chunking algorithm // and that chunkerParameters is of type Map @@ -190,7 +191,6 @@ public IngestDocument execute(IngestDocument document) { } } - document.setFieldValue(outputField, chunkedPassages); } return document; } From d23c1fbb3bde61d85c997d20f8a5f6f681546b44 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 19:12:59 +0800 Subject: [PATCH 020/189] bug fix for map type Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 33 +++++++++---------- .../DocumentChunkingProcessorTests.java | 2 ++ 2 files changed, 18 insertions(+), 17 deletions(-) create mode 100644 src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 95e3a0084..e0575e2b9 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -129,6 +129,7 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { private void validateContent(Object content, String inputField) { // content can be a map, a string or a list of strings if (content instanceof Map) { + System.out.println("map type"); @SuppressWarnings("unchecked") Map contentMap = (Map) content; for (Map.Entry contentEntry : contentMap.entrySet()) { @@ -137,32 +138,31 @@ private void validateContent(Object content, String inputField) { String contentKey = contentEntry.getKey(); Object contentValue = contentEntry.getValue(); // the map value can also be a map, list or string validateContent(contentValue, inputField + "."
+ contentKey); } - } - if (content instanceof List) { + } else if (content instanceof List) { List contentList = (List) content; for (Object contentElement : contentList) { if (!(contentElement instanceof String)) { throw new IllegalArgumentException( - "some element in input field list [" - + inputField - + "] of type [" - + contentElement.getClass().getName() - + "] cannot be cast to [" - + String.class.getName() - + "]" + "some element in input field list [" + + inputField + + "] of type [" + + contentElement.getClass().getName() + + "] cannot be cast to [" + + String.class.getName() + + "]" ); } } } if (!(content instanceof String)) { throw new IllegalArgumentException( - "input field [" - + inputField - + "] of type [" - + content.getClass().getName() - + "] cannot be cast to [" - + String.class.getName() - + "]" + "input field [" + + inputField + + "] of type [" + + content.getClass().getName() + + "] cannot be cast to [" + + String.class.getName() + + "]" ); } } @@ -191,7 +191,6 @@ private Object chunk(IFieldChunker chunker, Object content, Map } } - @Override public IngestDocument execute(IngestDocument document) { for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java new file mode 100644 index 000000000..3da358f41 --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -0,0 +1,2 @@ +package org.opensearch.neuralsearch.processor;public class DocumentChunkingProcessorTests { +} From 39c616278aaccba529720ca386c423cc6e9d8019 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 19:13:41 +0800 Subject: [PATCH 021/189] bug fix for map type Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index e0575e2b9..7b3bc3024 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -153,8 +153,7 @@ private void validateContent(Object content, String inputField) { ); } } - } - if (!(content instanceof String)) { + } else if (!(content instanceof String)) { throw new IllegalArgumentException( "input field [" + inputField From 5714d1e2d0e9c03f0cef3ae561e34aba217ae99c Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 19:20:42 +0800 Subject: [PATCH 022/189] bug fix for map type in document chunking processor Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 7b3bc3024..426d5d1ba 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -168,6 +168,7 @@ private void validateContent(Object content, String inputField) { private Object chunk(IFieldChunker chunker, Object content, Map chunkerParameters) { // assume that content is either a map, list 
or string + System.out.println("chunkerParameters: " + chunkerParameters); if (content instanceof Map) { Map chunkedPassageMap = new HashMap<>(); Map contentMap = (Map) content; @@ -227,7 +228,7 @@ public IngestDocument execute(IngestDocument document) { chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT, maxTokenCount); } IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); - document.setFieldValue(outputField, chunk(chunker, content, parameters)); + document.setFieldValue(outputField, chunk(chunker, content, chunkerParameters)); } } } From 2f23c303a9aadf4a7055ca447e2ad776fabd9c95 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 26 Feb 2024 19:21:25 +0800 Subject: [PATCH 023/189] remove system out println Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 426d5d1ba..2f3965773 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -168,7 +168,6 @@ private void validateContent(Object content, String inputField) { private Object chunk(IFieldChunker chunker, Object content, Map chunkerParameters) { // assume that content is either a map, list or string - System.out.println("chunkerParameters: " + chunkerParameters); if (content instanceof Map) { Map chunkedPassageMap = new HashMap<>(); Map contentMap = (Map) content; From 41cff0c83c093065ae0700057a87ba8bf71257ed Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 26 Feb 2024 14:34:39 +0800 Subject: [PATCH 024/189] add delimiter chunker Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index c9ef5e211..b95ccd088 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -4,20 +4,42 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Scanner; public class DelimiterChunker implements IFieldChunker { public DelimiterChunker() {} + public static String DELIMITER_FIELD = "delimiter"; + + @Override public void validateParameters(Map parameters) { - throw new UnsupportedOperationException("delimiter chunker has not been implemented yet"); + if (parameters.containsKey(DELIMITER_FIELD)) + { + Object delimiter = parameters.get(DELIMITER_FIELD); + if (!(delimiter instanceof String)){ + throw new IllegalArgumentException("delimiter parameters " + delimiter + " must be string"); + } + } + else { + throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter"); + } } @Override public List chunk(String content, Map parameters) { - throw new UnsupportedOperationException("delimiter chunker has not been implemented yet"); + List chunkingResult = new ArrayList<>(); + String delimiter = (String) parameters.get(DELIMITER_FIELD); + Scanner scanner = new Scanner(content); + 
scanner.useDelimiter(delimiter); + while (scanner.hasNext()) { + String nextChunk = scanner.next(); + chunkingResult.add(nextChunk); + } + return chunkingResult; } } From b0fda97acec92dd57a05ca12cc53814c23b1764a Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 26 Feb 2024 20:15:01 +0800 Subject: [PATCH 025/189] add UT for delimiter chunker Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 33 ++++---- .../chunker/DelimiterChunkerTests.java | 84 +++++++++++++++++++ 2 files changed, 102 insertions(+), 15 deletions(-) create mode 100644 src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index b95ccd088..625562a36 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -7,7 +7,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.Scanner; public class DelimiterChunker implements IFieldChunker { @@ -15,31 +14,35 @@ public DelimiterChunker() {} public static String DELIMITER_FIELD = "delimiter"; - @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(DELIMITER_FIELD)) - { + if (parameters.containsKey(DELIMITER_FIELD)) { Object delimiter = parameters.get(DELIMITER_FIELD); - if (!(delimiter instanceof String)){ - throw new IllegalArgumentException("delimiter parameters " + delimiter + " must be string"); + if (!(delimiter instanceof String)) { + throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string"); } - } - else { + } else { throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter"); } } @Override public List chunk(String content, Map parameters) { - List chunkingResult = new ArrayList<>(); String delimiter = (String) parameters.get(DELIMITER_FIELD); - Scanner scanner = new Scanner(content); - scanner.useDelimiter(delimiter); - while (scanner.hasNext()) { - String nextChunk = scanner.next(); - chunkingResult.add(nextChunk); + List chunkResult = new ArrayList<>(); + int start = 0; + int end = content.indexOf(delimiter); + + while (end != -1) { + chunkResult.add(content.substring(start, end + delimiter.length())); + start = end + delimiter.length(); + end = content.indexOf(delimiter, start); } - return chunkingResult; + + if (start < content.length()) { + chunkResult.add(content.substring(start)); + } + return chunkResult; + } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java new file mode 100644 index 000000000..776f96479 --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -0,0 +1,84 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; +import java.util.Map; + +import static junit.framework.TestCase.assertEquals; +import static org.junit.Assert.assertThrows; +import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; + +public class DelimiterChunkerTests { + + 
@Test + public void testChunkerWithNoDelimiterField() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of("", ""); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter", exception.getMessage()); + } + + @Test + public void testChunkerWithDelimiterFieldNotString() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(DELIMITER_FIELD, List.of("")); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string", exception.getMessage()); + } + + @Test + public void testChunker() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(4, chunkResult.size()); + assertEquals(7, cntLength(chunkResult)); + } + + @Test + public void testChunkerWithDelimiterEnd() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd\n"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(4, chunkResult.size()); + assertEquals(8, cntLength(chunkResult)); + } + + @Test + public void testChunkerWithOnlyDelimiter() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "\n"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(1, chunkResult.size()); + assertEquals(1, cntLength(chunkResult)); + } + + @Test + public void testChunkerWithAllDelimiters() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "\n\n\n"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(3, chunkResult.size()); + assertEquals(3, cntLength(chunkResult)); + } + + private int cntLength(List outputs) { + int totalLength = 0; + for (String output : outputs) { + totalLength += output.length(); + } + return totalLength; + } +} From b16e7c465436091f48a3591f8072bf5a25dd4cf2 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 26 Feb 2024 20:16:36 +0800 Subject: [PATCH 026/189] add delimiter chunker processor Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/DelimiterChunkerProcessor.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 src/test/resources/processor/DelimiterChunkerProcessor.json diff --git a/src/test/resources/processor/DelimiterChunkerProcessor.json b/src/test/resources/processor/DelimiterChunkerProcessor.json new file mode 100644 index 000000000..c94f3e249 --- /dev/null +++ b/src/test/resources/processor/DelimiterChunkerProcessor.json @@ -0,0 +1,17 @@ +{ + "description": "An example delimiter chunker pipeline", + "processors" : [ + { + "chunking": { + "field_map": { + "body_chunk1": { + "delimiter": { + "delimiter": "\n" + }, + "output_field": "body_chunk2" + } + } + } + } + ] +} \ No newline at end of file From 11e6a4b22f4a119d40e0305da89e52b76b9d395f Mon Sep 17 00:00:00 2001 From: xinyual Date: Tue, 27 Feb 2024 11:28:29 +0800 Subject: [PATCH 027/189] add more UTs Signed-off-by: xinyual Signed-off-by: yuye-aws --- 
.../chunker/DelimiterChunkerTests.java | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 776f96479..8e16ffee2 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -40,8 +40,7 @@ public void testChunker() { String content = "a\nb\nc\nd"; Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); List chunkResult = chunker.chunk(content, inputParameters); - assertEquals(4, chunkResult.size()); - assertEquals(7, cntLength(chunkResult)); + assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } @Test @@ -50,8 +49,7 @@ public void testChunkerWithDelimiterEnd() { String content = "a\nb\nc\nd\n"; Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); List chunkResult = chunker.chunk(content, inputParameters); - assertEquals(4, chunkResult.size()); - assertEquals(8, cntLength(chunkResult)); + assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult); } @Test @@ -60,8 +58,7 @@ public void testChunkerWithOnlyDelimiter() { String content = "\n"; Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); List chunkResult = chunker.chunk(content, inputParameters); - assertEquals(1, chunkResult.size()); - assertEquals(1, cntLength(chunkResult)); + assertEquals(List.of("\n"), chunkResult); } @Test @@ -70,15 +67,25 @@ public void testChunkerWithAllDelimiters() { String content = "\n\n\n"; Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); List chunkResult = chunker.chunk(content, inputParameters); - assertEquals(3, chunkResult.size()); - assertEquals(3, cntLength(chunkResult)); + assertEquals(List.of("\n", "\n", "\n"), chunkResult); } - private int cntLength(List outputs) { - int totalLength = 0; - for (String output : outputs) { - totalLength += output.length(); - } - return totalLength; + @Test + public void testChunkerWithDifferentDelimiters() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a.b.cc.d."; + Map inputParameters = Map.of(DELIMITER_FIELD, "."); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(List.of("a.", "b.", "cc.", "d."), chunkResult); } + + @Test + public void testChunkerWithStringDelimiter() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "\n\na\n\n\n"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n\n"); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); + } + } From 81000f3a3153eb96c02bcd7ace4cde3ef86d6641 Mon Sep 17 00:00:00 2001 From: xinyual Date: Tue, 27 Feb 2024 11:48:15 +0800 Subject: [PATCH 028/189] add more UTs Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 6 ++++-- .../processor/DocumentChunkingProcessorTests.java | 9 +++++++-- .../processor/chunker/DelimiterChunkerTests.java | 13 +++++++++++-- .../processor/DelimiterChunkerProcessor.json | 2 +- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 625562a36..31577604f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++
b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -19,10 +19,12 @@ public void validateParameters(Map parameters) { if (parameters.containsKey(DELIMITER_FIELD)) { Object delimiter = parameters.get(DELIMITER_FIELD); if (!(delimiter instanceof String)) { - throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string"); + throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string."); + } else if (((String) delimiter).length() == 0) { + throw new IllegalArgumentException("delimiter parameters should not be empty."); } } else { - throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter"); + throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter."); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 3da358f41..639a85898 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -1,2 +1,7 @@ -package org.opensearch.neuralsearch.processor;public class DocumentChunkingProcessorTests { -} +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor; + +public class DocumentChunkingProcessorTests {} diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 8e16ffee2..8838310f4 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -22,7 +22,7 @@ public void testChunkerWithNoDelimiterField() { String content = "a\nb\nc\nd"; Map inputParameters = Map.of("", ""); Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter", exception.getMessage()); + Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter.", exception.getMessage()); } @Test @@ -31,7 +31,16 @@ public void testChunkerWithDelimiterFieldNotString() { String content = "a\nb\nc\nd"; Map inputParameters = Map.of(DELIMITER_FIELD, List.of("")); Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string", exception.getMessage()); + Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string.", exception.getMessage()); + } + + @Test + public void testChunkerWithDelimiterFieldNoString() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(DELIMITER_FIELD, ""); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage()); } @Test diff --git a/src/test/resources/processor/DelimiterChunkerProcessor.json b/src/test/resources/processor/DelimiterChunkerProcessor.json index c94f3e249..08077bc54 100644 
--- a/src/test/resources/processor/DelimiterChunkerProcessor.json +++ b/src/test/resources/processor/DelimiterChunkerProcessor.json @@ -14,4 +14,4 @@ } } ] -} \ No newline at end of file +} From f3b468f2134355959a19940d765717f58594beef Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 27 Feb 2024 13:47:39 +0800 Subject: [PATCH 029/189] basic unit tests for document chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 6 +- .../DocumentChunkingProcessorTests.java | 124 +++++++++++++++++- .../chunker/ChunkerFactoryTests.java | 8 +- .../chunker/FixedTokenLengthChunkerTests.java | 1 - 4 files changed, 130 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 2f3965773..a5a5726d4 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -27,13 +27,14 @@ import org.opensearch.index.mapper.IndexFieldMapper; import static org.opensearch.ingest.ConfigurationUtils.readMap; -import static org.opensearch.neuralsearch.processor.InferenceProcessor.FIELD_MAP_FIELD; public final class DocumentChunkingProcessor extends AbstractProcessor { public static final String TYPE = "chunking"; public static final String OUTPUT_FIELD = "output_field"; + public static final String FIELD_MAP_FIELD = "field_map"; + private final Map fieldMap; private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); @@ -129,7 +130,6 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) { private void validateContent(Object content, String inputField) { // content can be a map, a string or a list of strings if (content instanceof Map) { - System.out.println("map type"); @SuppressWarnings("unchecked") Map contentMap = (Map) content; for (Map.Entry contentEntry : contentMap.entrySet()) { @@ -214,7 +214,7 @@ public IngestDocument execute(IngestDocument document) { @SuppressWarnings("unchecked") Map chunkerParameters = (Map) parameterEntry.getValue(); if (Objects.equals(parameterKey, ChunkerFactory.FIXED_LENGTH_ALGORITHM)) { - // add maxTokenCount to chunker parameters + // for fixed token length algorithm, add maxTokenCount to chunker parameters Map sourceAndMetadataMap = document.getSourceAndMetadata(); int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings); String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 639a85898..17044eb6a 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -4,4 +4,126 @@ */ package org.opensearch.neuralsearch.processor; -public class DocumentChunkingProcessorTests {} +import lombok.SneakyThrows; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.junit.Before; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.env.TestEnvironment; +import
org.opensearch.index.analysis.AnalysisRegistry; +import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.index.mapper.IndexFieldMapper; +import org.opensearch.indices.IndicesService; +import org.opensearch.indices.analysis.AnalysisModule; +import org.opensearch.ingest.IngestDocument; +import org.opensearch.ingest.Processor; +import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; +import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; +import org.opensearch.plugins.AnalysisPlugin; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.singletonList; +import static java.util.Collections.singletonMap; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.when; +import static org.mockito.Mockito.mock; + +public class DocumentChunkingProcessorTests extends OpenSearchTestCase { + + private DocumentChunkingProcessor.Factory factory; + + private static final String PROCESSOR_TAG = "mockTag"; + private static final String DESCRIPTION = "mockDescription"; + + @SneakyThrows + private AnalysisRegistry getAnalysisRegistry() { + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); + Environment environment = TestEnvironment.newEnvironment(settings); + AnalysisPlugin plugin = new AnalysisPlugin() { + + @Override + public Map> getTokenizers() { + return singletonMap( + "keyword", + (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( + name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false) + ) + ); + } + }; + return new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); + } + + @Before + public void setup() { + Settings settings = Settings.builder().build(); + Metadata metadata = mock(Metadata.class); + ClusterState clusterState = mock(ClusterState.class); + ClusterService clusterService = mock(ClusterService.class); + IndicesService indicesService = mock(IndicesService.class); + when(metadata.index(anyString())).thenReturn(null); + when(clusterState.metadata()).thenReturn(metadata); + when(clusterService.state()).thenReturn(clusterState); + factory = new DocumentChunkingProcessor.Factory(settings, clusterService, indicesService, getAnalysisRegistry()); + } + + @SneakyThrows + public void testGetType() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + String type = processor.getType(); + assertEquals(DocumentChunkingProcessor.TYPE, type); + } + + private Map createFixedTokenLengthParameters() { + Map parameters = new HashMap<>(); + parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT, 10); + return parameters; + } + + @SneakyThrows + private DocumentChunkingProcessor createFixedTokenLengthInstance() { + Map config = new HashMap<>(); + Map fieldParameters = new HashMap<>(); + Map chunkerParameters = new HashMap<>(); + chunkerParameters.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); + chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, "body_chunk"); + fieldParameters.put("body", chunkerParameters); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters); + Map registry = new HashMap<>(); + return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + + private IngestDocument createIngestDocument() { + Map sourceAndMetadata = new HashMap<>(); + 
sourceAndMetadata.put( + "body", + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + sourceAndMetadata.put(IndexFieldMapper.NAME, "_index"); + return new IngestDocument(sourceAndMetadata, new HashMap<>()); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + IngestDocument ingestDocument = createIngestDocument(); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey("body_chunk"); + Object passages = document.getSourceAndMetadata().get("body_chunk"); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } +} diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 8ecbae67d..0f6a95d40 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -13,7 +13,7 @@ public class ChunkerFactoryTests extends OpenSearchTestCase { @Mock - private AnalysisRegistry registry; + private AnalysisRegistry analysisRegistry; public void testGetAllChunkers() { Set expected = Set.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, ChunkerFactory.DELIMITER_ALGORITHM); @@ -21,13 +21,13 @@ public void testGetAllChunkers() { } public void testCreate_FixedTokenLength() { - IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, registry); + IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, analysisRegistry); assertNotNull(chunker); assertTrue(chunker instanceof FixedTokenLengthChunker); } public void testCreate_Delimiter() { - IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, registry); + IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry); assertNotNull(chunker); assertTrue(chunker instanceof DelimiterChunker); } @@ -35,7 +35,7 @@ public void testCreate_Delimiter() { public void testCreate_Invalid() { IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> ChunkerFactory.create("Invalid Chunker Type", registry) + () -> ChunkerFactory.create("Invalid Chunker Type", analysisRegistry) ); assertEquals( "chunker type [Invalid Chunker Type] is not supported. 
Supported chunkers types are [fix_length, delimiter]", diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index bea3a68bf..e9b36682a 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -36,7 +36,6 @@ public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { public void setup() { Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); Environment environment = TestEnvironment.newEnvironment(settings); - AnalysisPlugin plugin = new AnalysisPlugin() { @Override From eea6fc8c474b519e294f2793aa89a7e13c2a2e5b Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 27 Feb 2024 14:11:09 +0800 Subject: [PATCH 030/189] fix tests for getProcessors in neural search Signed-off-by: yuye-aws --- .../opensearch/neuralsearch/plugin/NeuralSearchTests.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java b/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java index 2a66f6992..cb3869868 100644 --- a/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java +++ b/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java @@ -5,11 +5,14 @@ package org.opensearch.neuralsearch.plugin; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import java.util.List; import java.util.Map; import java.util.Optional; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; import org.opensearch.indices.IndicesService; import org.opensearch.ingest.IngestService; import org.opensearch.ingest.Processor; @@ -57,8 +60,11 @@ public void testQueryPhaseSearcher() { public void testProcessors() { NeuralSearch plugin = new NeuralSearch(); + Settings settings = Settings.builder().build(); + Environment environment = mock(Environment.class); + when(environment.settings()).thenReturn(settings); Processor.Parameters processorParams = new Processor.Parameters( - null, + environment, null, null, null, From ec6bf49c4fd44fd67fa3149a564c5a7d4da2ae59 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 27 Feb 2024 15:34:27 +0800 Subject: [PATCH 031/189] add unit tests with string, map and nested map type for document chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 2 +- .../chunker/FixedTokenLengthChunker.java | 50 ++--- .../DocumentChunkingProcessorTests.java | 181 ++++++++++++++++-- .../chunker/DelimiterChunkerTests.java | 15 +- .../chunker/FixedTokenLengthChunkerTests.java | 24 +-- 5 files changed, 209 insertions(+), 63 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index a5a5726d4..550c8013f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -224,7 +224,7 @@ public IngestDocument execute(IngestDocument document) { IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); maxTokenCount = 
indexService.getIndexSettings().getMaxTokenCount(); } - chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT, maxTokenCount); + chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); } IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); document.setFieldValue(outputField, chunk(chunker, content, chunkerParameters)); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 9045f4be6..3079fcf8e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -19,12 +19,12 @@ @Log4j2 public class FixedTokenLengthChunker implements IFieldChunker { - public static final String TOKEN_LIMIT = "token_limit"; - public static final String OVERLAP_RATE = "overlap_rate"; + public static final String TOKEN_LIMIT_FIELD = "token_limit"; + public static final String OVERLAP_RATE_FIELD = "overlap_rate"; - public static final String MAX_TOKEN_COUNT = "max_token_count"; + public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; - public static final String TOKENIZER = "tokenizer"; + public static final String TOKENIZER_FIELD = "tokenizer"; // default values for each parameter private static final int DEFAULT_TOKEN_LIMIT = 500; @@ -64,17 +64,17 @@ public List chunk(String content, Map parameters) { int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; String tokenizer = DEFAULT_TOKENIZER; - if (parameters.containsKey(TOKEN_LIMIT)) { - tokenLimit = ((Number) parameters.get(TOKEN_LIMIT)).intValue(); + if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { + tokenLimit = ((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue(); } - if (parameters.containsKey(OVERLAP_RATE)) { - overlapRate = ((Number) parameters.get(OVERLAP_RATE)).doubleValue(); + if (parameters.containsKey(OVERLAP_RATE_FIELD)) { + overlapRate = ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue(); } - if (parameters.containsKey(MAX_TOKEN_COUNT)) { - maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT)).intValue(); + if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) { + maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT_FIELD)).intValue(); } - if (parameters.containsKey(TOKENIZER)) { - tokenizer = (String) parameters.get(TOKENIZER); + if (parameters.containsKey(TOKENIZER_FIELD)) { + tokenizer = (String) parameters.get(TOKENIZER_FIELD); } List tokens = tokenize(content, tokenizer, maxTokenCount); @@ -103,34 +103,34 @@ public List chunk(String content, Map parameters) { @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(TOKEN_LIMIT)) { - if (!(parameters.get(TOKEN_LIMIT) instanceof Number)) { + if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { + if (!(parameters.get(TOKEN_LIMIT_FIELD) instanceof Number)) { throw new IllegalArgumentException( - "fixed length parameter [" + TOKEN_LIMIT + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (((Number) parameters.get(TOKEN_LIMIT)).intValue() <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT + "] must be positive"); + if (((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue() <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + 
TOKEN_LIMIT_FIELD + "] must be positive"); } } - if (parameters.containsKey(OVERLAP_RATE)) { - if (!(parameters.get(OVERLAP_RATE) instanceof Number)) { + if (parameters.containsKey(OVERLAP_RATE_FIELD)) { + if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (((Number) parameters.get(OVERLAP_RATE)).doubleValue() < 0.0 - || ((Number) parameters.get(OVERLAP_RATE)).doubleValue() >= 1.0) { + if (((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() < 0.0 + || ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() >= 1.0) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE + "] must be between 0 and 1, 1 is not included." + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 1, 1 is not included." ); } } - if (parameters.containsKey(TOKENIZER) && !(parameters.get(TOKENIZER) instanceof String)) { + if (parameters.containsKey(TOKENIZER_FIELD) && !(parameters.get(TOKENIZER_FIELD) instanceof String)) { throw new IllegalArgumentException( - "fixed length parameter [" + TOKENIZER + "] cannot be cast to [" + String.class.getName() + "]" + "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" ); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 17044eb6a..3b0395320 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -4,6 +4,7 @@ */ package org.opensearch.neuralsearch.processor; +import com.google.common.collect.ImmutableMap; import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; @@ -21,6 +22,7 @@ import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; +import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; @@ -42,6 +44,9 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase { private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; + private static final String INPUT_FIELD = "body"; + private static final String OUTPUT_FIELD = "body_chunk"; + private static final String INDEX_NAME = "_index"; @SneakyThrows private AnalysisRegistry getAnalysisRegistry() { @@ -85,7 +90,13 @@ public void testGetType() { private Map createFixedTokenLengthParameters() { Map parameters = new HashMap<>(); - parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT, 10); + parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); + return parameters; + } + + private Map createDelimiterParameters() { + Map parameters = new HashMap<>(); + parameters.put(DelimiterChunker.DELIMITER_FIELD, "."); return parameters; } @@ -95,30 +106,79 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance() { Map fieldParameters = new HashMap<>(); Map chunkerParameters 
= new HashMap<>(); chunkerParameters.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, "body_chunk"); - fieldParameters.put("body", chunkerParameters); + chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD); + fieldParameters.put(INPUT_FIELD, chunkerParameters); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } - private IngestDocument createIngestDocument() { - Map sourceAndMetadata = new HashMap<>(); - sourceAndMetadata.put( - "body", - "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + @SneakyThrows + private DocumentChunkingProcessor createDelimiterInstance() { + Map config = new HashMap<>(); + Map fieldParameters = new HashMap<>(); + Map chunkerParameters = new HashMap<>(); + chunkerParameters.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); + chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD); + fieldParameters.put(INPUT_FIELD, chunkerParameters); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters); + Map registry = new HashMap<>(); + return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + + private String createSourceDataString() { + return "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + } + + private List createSourceDataList() { + List documents = new ArrayList<>(); + documents.add( + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + documents.add( + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); - sourceAndMetadata.put(IndexFieldMapper.NAME, "_index"); + return documents; + } + + private Map createSourceDataMap() { + Map documents = new HashMap<>(); + documents.put( + "third", + "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + documents.put( + "fourth", + "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
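+ // two entries ("third" and "fourth") so the map-type tests can verify per-key chunking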
+ ); + return documents; + } + + private Map createSourceDataNestedMap() { + String documentString = createSourceDataString(); + List documentList = createSourceDataList(); + Map documentMap = createSourceDataMap(); + Map documents = new HashMap<>(); + documents.put("String", documentString); + documents.put("List", documentList); + documents.put("Map", documentMap); + return documents; + } + + private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { + Map sourceAndMetadata = new HashMap<>(); + sourceAndMetadata.put(INPUT_FIELD, sourceData); + sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME); return new IngestDocument(sourceAndMetadata, new HashMap<>()); } @SneakyThrows - public void testExecute_withFixedTokenLength_successful() { + public void testExecute_withFixedTokenLength_andSourceDataString_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); - IngestDocument ingestDocument = createIngestDocument(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); - assert document.getSourceAndMetadata().containsKey("body_chunk"); - Object passages = document.getSourceAndMetadata().get("body_chunk"); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); @@ -126,4 +186,99 @@ public void testExecute_withFixedTokenLength_successful() { expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataList_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataList()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is the first document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("This is the second document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataMap_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataMap()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof Map); + + List expectedPassages1 = new ArrayList<>(); + List expectedPassages2 = new ArrayList<>(); + + expectedPassages1.add("This is the third document to be chunked The document"); + 
expectedPassages1.add("The document contains a single paragraph two sentences and 24"); + expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages2.add("This is the fourth document to be chunked The document"); + expectedPassages2.add("The document contains a single paragraph two sentences and 24"); + expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); + + Map expectedPassages = ImmutableMap.of("third", expectedPassages1, "fourth", expectedPassages2); + + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataNestedMap_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataNestedMap()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof Map); + + Map expectedPassages = new HashMap<>(); + List expectedPassages1 = new ArrayList<>(); + List expectedPassages2 = new ArrayList<>(); + List expectedPassages3 = new ArrayList<>(); + List expectedPassages4 = new ArrayList<>(); + + expectedPassages1.add("This is an example document to be chunked The document"); + expectedPassages1.add("The document contains a single paragraph two sentences and 24"); + expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages2.add("This is the first document to be chunked The document"); + expectedPassages2.add("The document contains a single paragraph two sentences and 24"); + expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages2.add("This is the second document to be chunked The document"); + expectedPassages2.add("The document contains a single paragraph two sentences and 24"); + expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages3.add("This is the third document to be chunked The document"); + expectedPassages3.add("The document contains a single paragraph two sentences and 24"); + expectedPassages3.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages4.add("This is the fourth document to be chunked The document"); + expectedPassages4.add("The document contains a single paragraph two sentences and 24"); + expectedPassages4.add("and 24 tokens by standard tokenizer in OpenSearch"); + + expectedPassages.put("String", expectedPassages1); + expectedPassages.put("List", expectedPassages2); + expectedPassages.put("Map", ImmutableMap.of("third", expectedPassages3, "fourth", expectedPassages4)); + + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withDelimiter_andSourceDataString_successful() { + DocumentChunkingProcessor processor = createDelimiterInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked."); + expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer 
in OpenSearch."); + assertEquals(expectedPassages, passages); + } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 8838310f4..d201ab574 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -5,7 +5,7 @@ package org.opensearch.neuralsearch.processor.chunker; import org.junit.Assert; -import org.junit.Test; +import org.opensearch.test.OpenSearchTestCase; import java.util.List; import java.util.Map; @@ -14,9 +14,8 @@ import static org.junit.Assert.assertThrows; import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; -public class DelimiterChunkerTests { +public class DelimiterChunkerTests extends OpenSearchTestCase { - @Test public void testChunkerWithNoDelimiterField() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -25,7 +24,6 @@ public void testChunkerWithNoDelimiterField() { Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter.", exception.getMessage()); } - @Test public void testChunkerWithDelimiterFieldNotString() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -34,7 +32,6 @@ public void testChunkerWithDelimiterFieldNotString() { Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string.", exception.getMessage()); } - @Test public void testChunkerWithDelimiterFieldNoString() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -43,7 +40,6 @@ public void testChunkerWithDelimiterFieldNoString() { Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage()); } - @Test public void testChunker() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -52,7 +48,6 @@ public void testChunker() { assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } - @Test public void testChunkerWithDelimiterEnd() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd\n"; @@ -61,7 +56,6 @@ public void testChunkerWithDelimiterEnd() { assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult); } - @Test public void testChunkerWithOnlyDelimiter() { DelimiterChunker chunker = new DelimiterChunker(); String content = "\n"; @@ -70,7 +64,6 @@ public void testChunkerWithOnlyDelimiter() { assertEquals(List.of("\n"), chunkResult); } - @Test public void testChunkerWithAllDelimiters() { DelimiterChunker chunker = new DelimiterChunker(); String content = "\n\n\n"; @@ -79,7 +72,6 @@ public void testChunkerWithAllDelimiters() { assertEquals(List.of("\n", "\n", "\n"), chunkResult); } - @Test public void testChunkerWithDifferentDelimiters() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a.b.cc.d."; @@ -88,8 +80,7 @@ public void testChunkerWithDifferentDelimiters() { assertEquals(List.of("a.", "b.", "cc.", "d."), chunkResult); } - @Test - public void testChunkerWithStringDelimter() { + public void testChunkerWithStringDelimiter() { DelimiterChunker chunker = new DelimiterChunker(); String content = "\n\na\n\n\n"; Map inputParameters = Map.of(DELIMITER_FIELD, "\n\n"); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java 
b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index e9b36682a..4c498d070 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -23,9 +23,9 @@ import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { @@ -61,7 +61,7 @@ public void testValidateParameters_whenNoParams_thenSuccessful() { public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT, "invalid token limit"); + parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -71,7 +71,7 @@ public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT, -1); + parameters.put(TOKEN_LIMIT_FIELD, -1); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -81,7 +81,7 @@ public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { Map parameters = new HashMap<>(); - parameters.put(OVERLAP_RATE, "invalid overlap rate"); + parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -91,7 +91,7 @@ public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { Map parameters = new HashMap<>(); - parameters.put(OVERLAP_RATE, 1.0); + parameters.put(OVERLAP_RATE_FIELD, 1.0); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -104,7 +104,7 @@ public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { public void testValidateParameters_whenIllegalTokenizerType_thenFail() { Map parameters = new HashMap<>(); - parameters.put(TOKENIZER, 111); + parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -114,7 +114,7 @@ public void testValidateParameters_whenIllegalTokenizerType_thenFail() { public void testChunk_withTokenLimit_10() { Map parameters = new HashMap<>(); 
- parameters.put(TOKEN_LIMIT, 10);
+ parameters.put(TOKEN_LIMIT_FIELD, 10);
 String content =
 "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
 List passages = FixedTokenLengthChunker.chunk(content, parameters);
@@ -127,7 +127,7 @@ public void testChunk_withTokenLimit_10() {
 public void testChunk_withTokenLimit_20() {
 Map parameters = new HashMap<>();
- parameters.put(TOKEN_LIMIT, 20);
+ parameters.put(TOKEN_LIMIT_FIELD, 20);
 String content =
 "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
 List passages = FixedTokenLengthChunker.chunk(content, parameters);
@@ -141,8 +141,8 @@ public void testChunk_withTokenLimit_20() {
 public void testChunk_withOverlapRate_half() {
 Map parameters = new HashMap<>();
- parameters.put(TOKEN_LIMIT, 10);
- parameters.put(OVERLAP_RATE, 0.5);
+ parameters.put(TOKEN_LIMIT_FIELD, 10);
+ parameters.put(OVERLAP_RATE_FIELD, 0.5);
 String content =
 "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
 List passages = FixedTokenLengthChunker.chunk(content, parameters);

From 3ae94e444880257e8d5860f921a1845af017a5fd Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Tue, 27 Feb 2024 16:49:38 +0800
Subject: [PATCH 032/189] add unit tests for parameter validation in document chunking processor

Signed-off-by: yuye-aws
---
 .idea/runConfigurations/Run_Neural_Search.xml |  23 ----
 .../processor/DocumentChunkingProcessor.java  |   4 +-
 .../DocumentChunkingProcessorTests.java       | 116 ++++++++++++++++--
 3 files changed, 112 insertions(+), 31 deletions(-)
 delete mode 100644 .idea/runConfigurations/Run_Neural_Search.xml

diff --git a/.idea/runConfigurations/Run_Neural_Search.xml b/.idea/runConfigurations/Run_Neural_Search.xml
deleted file mode 100644
index d881bd512..000000000
--- a/.idea/runConfigurations/Run_Neural_Search.xml
+++ /dev/null
@@ -1,23 +0,0 @@
-
-
-
-
-
-
- true
- true
- false
-
-
\ No newline at end of file
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
index 550c8013f..d1a1af536 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
@@ -93,7 +93,9 @@ private void validateDocumentChunkingFieldMap(Map fieldMap) {
 // output field must be string
 if (!(parameterMap.containsKey(OUTPUT_FIELD))) {
- throw new IllegalArgumentException("parameters for output field [" + OUTPUT_FIELD + "] is null, cannot process it.");
+ throw new IllegalArgumentException(
+ "parameters for input field [" + inputField + "] misses [" + OUTPUT_FIELD + "], cannot process it."
+ ); } Object outputField = parameterMap.get(OUTPUT_FIELD); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 3b0395320..daadcbb6a 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -81,13 +81,6 @@ public void setup() { factory = new DocumentChunkingProcessor.Factory(settings, clusterService, indicesService, getAnalysisRegistry()); } - @SneakyThrows - public void testGetType() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); - String type = processor.getType(); - assertEquals(DocumentChunkingProcessor.TYPE, type); - } - private Map createFixedTokenLengthParameters() { Map parameters = new HashMap<>(); parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); @@ -126,6 +119,115 @@ private DocumentChunkingProcessor createDelimiterInstance() { return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } + public void testCreate_whenFieldMapEmpty_failure() { + Map config = new HashMap<>(); + Map emptyFieldMap = new HashMap<>(); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, emptyFieldMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals("Unable to create the processor as field_map is null or empty", illegalArgumentException.getMessage()); + } + + public void testCreate_whenFieldMapWithEmptyParameter_failure() { + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + fieldMap.put("key", null); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals("parameters for input field [key] is null, cannot process it.", illegalArgumentException.getMessage()); + } + + public void testCreate_whenFieldMapWithIllegalParameterType_failure() { + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + fieldMap.put("key", "value"); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals("parameters for input field [key] cannot be cast to [java.util.Map]", illegalArgumentException.getMessage()); + } + + public void testCreate_whenFieldMapWithEmptyOutputField_failure() { + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, ImmutableMap.of()); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals( + "parameters for input field [" + INPUT_FIELD + "] misses [" + DocumentChunkingProcessor.OUTPUT_FIELD + "], cannot process it.", + illegalArgumentException.getMessage() + ); + } + + public void testCreate_whenFieldMapWithIllegalOutputField_failure() 
{ + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, ImmutableMap.of(DocumentChunkingProcessor.OUTPUT_FIELD, 1)); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals( + "parameters for output field [output_field] cannot be cast to [java.lang.String]", + illegalArgumentException.getMessage() + ); + } + + public void testCreate_whenFieldMapWithIllegalKey_failure() { + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, ImmutableMap.of(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD, 1, 1)); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals( + "found parameter entry with non-string key", + illegalArgumentException.getMessage() + ); + } + + public void testCreate_whenFieldMapWithNoAlgorithm_failure() { + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, ImmutableMap.of(DocumentChunkingProcessor.OUTPUT_FIELD, INPUT_FIELD)); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals( + "input field [" + INPUT_FIELD + "] should has and only has 1 chunking algorithm", + illegalArgumentException.getMessage() + ); + } + + @SneakyThrows + public void testGetType() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + String type = processor.getType(); + assertEquals(DocumentChunkingProcessor.TYPE, type); + } + private String createSourceDataString() { return "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; } From c8dc66c95ffcfbb522aa275cd453d3392b1f5b6a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 27 Feb 2024 17:19:47 +0800 Subject: [PATCH 033/189] add back deleted xml file Signed-off-by: yuye-aws --- .idea/runConfigurations/Run_Neural_Search.xml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .idea/runConfigurations/Run_Neural_Search.xml diff --git a/.idea/runConfigurations/Run_Neural_Search.xml b/.idea/runConfigurations/Run_Neural_Search.xml new file mode 100644 index 000000000..605085bbf --- /dev/null +++ b/.idea/runConfigurations/Run_Neural_Search.xml @@ -0,0 +1,23 @@ + + + + + + + true + true + false + + + From 1e1ce1b6050e48c020fb3673fdfd16844d1e7917 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 27 Feb 2024 17:20:46 +0800 Subject: [PATCH 034/189] restore xml file Signed-off-by: yuye-aws --- .idea/runConfigurations/Run_Neural_Search.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.idea/runConfigurations/Run_Neural_Search.xml b/.idea/runConfigurations/Run_Neural_Search.xml index 605085bbf..d881bd512 100644 --- a/.idea/runConfigurations/Run_Neural_Search.xml +++ b/.idea/runConfigurations/Run_Neural_Search.xml @@ -20,4 +20,4 @@ false - + \ No newline at end of file From b4251220cf2fc46a4dbd9e12e777a6ded1b76c4e Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 29 Feb 2024 11:31:02 +0800 Subject: [PATCH 035/189] integration tests for document chunking processor Signed-off-by: yuye-aws --- .idea/runConfigurations/Run_Neural_Search.xml | 23 --- .../DocumentChunkingProcessorIT.java | 179 ++++++++++++++++++ .../DocumentChunkingProcessorTests.java | 5 +- .../DocumentChunkingIndexSettings.json | 6 + .../chunker/DocumentChunkingTestDocument.json | 3 + .../DocumentChunkingTestLongDocument.json | 3 + .../chunker/PipelineForCascadedChunker.json | 29 +++ .../PipelineForDelimiterChunker.json} | 6 +- .../PipelineForFixedTokenLengthChunker.json | 17 ++ 9 files changed, 241 insertions(+), 30 deletions(-) delete mode 100644 .idea/runConfigurations/Run_Neural_Search.xml create mode 100644 src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java create mode 100644 src/test/resources/processor/chunker/DocumentChunkingIndexSettings.json create mode 100644 src/test/resources/processor/chunker/DocumentChunkingTestDocument.json create mode 100644 src/test/resources/processor/chunker/DocumentChunkingTestLongDocument.json create mode 100644 src/test/resources/processor/chunker/PipelineForCascadedChunker.json rename src/test/resources/processor/{DelimiterChunkerProcessor.json => chunker/PipelineForDelimiterChunker.json} (67%) create mode 100644 src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json diff --git a/.idea/runConfigurations/Run_Neural_Search.xml b/.idea/runConfigurations/Run_Neural_Search.xml deleted file mode 100644 index d881bd512..000000000 --- a/.idea/runConfigurations/Run_Neural_Search.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - true - true - false - - - \ No newline at end of file diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java new file mode 100644 index 000000000..320d7ac6f --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -0,0 +1,179 @@ +/* + * Copyright OpenSearch Contributors + * 
SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor; + +import com.google.common.collect.ImmutableList; +import org.apache.hc.core5.http.HttpHeaders; +import org.apache.hc.core5.http.io.entity.EntityUtils; +import org.apache.hc.core5.http.message.BasicHeader; +import org.junit.Before; +import org.opensearch.client.Response; +import org.opensearch.common.xcontent.XContentHelper; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.index.query.MatchAllQueryBuilder; +import org.opensearch.neuralsearch.BaseNeuralSearchIT; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT; + +public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT { + private static final String INDEX_NAME = "document_chunking_index"; + + private static final String OUTPUT_FIELD = "body_chunk"; + + private static final String FIXED_TOKEN_LENGTH_PIPELINE_NAME = "pipeline-document-chunking-fixed-token-length"; + + private static final String DELIMITER_PIPELINE_NAME = "pipeline-document-chunking-delimiter"; + + private static final String CASCADE_PIPELINE_NAME = "pipeline-document-chunking-cascade"; + + private static final String TEST_DOCUMENT = "processor/chunker/DocumentChunkingTestDocument.json"; + + private static final String TEST_LONG_DOCUMENT = "processor/chunker/DocumentChunkingTestLongDocument.json"; + + private static final Map PIPELINE_CONFIGS_BY_NAME = Map.of( + FIXED_TOKEN_LENGTH_PIPELINE_NAME, + "processor/chunker/PipelineForFixedTokenLengthChunker.json", + DELIMITER_PIPELINE_NAME, + "processor/chunker/PipelineForDelimiterChunker.json", + CASCADE_PIPELINE_NAME, + "processor/chunker/PipelineForCascadedChunker.json" + ); + + @Before + public void setUp() throws Exception { + super.setUp(); + updateClusterSettings(); + } + + public void testDocumentChunkingProcessor_withFixedTokenLength_successful() throws Exception { + try { + createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME); + createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME); + ingestDocument(TEST_DOCUMENT); + + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + validateIndexIngestResults(INDEX_NAME, expectedPassages); + } finally { + wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null); + } + } + + public void testDocumentChunkingProcessor_withFixedTokenLength_fail() throws Exception { + try { + createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME); + createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME); + Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT)); + // max_token_count is 100 by index settings + assert (exception.getMessage() + .contains("The number of tokens produced by calling _analyze has exceeded the allowed maximum of [100].")); + assertEquals(0, getDocCount(INDEX_NAME)); + } finally { + wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null); + } + } + + public void testDocumentChunkingProcessor_withDelimiter_successful() throws Exception { + try { + createPipelineProcessor(DELIMITER_PIPELINE_NAME); + createDocumentChunkingIndex(DELIMITER_PIPELINE_NAME); + 
ingestDocument(TEST_DOCUMENT); + + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked."); + expectedPassages.add( + " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + validateIndexIngestResults(INDEX_NAME, expectedPassages); + } finally { + wipeOfTestResources(INDEX_NAME, DELIMITER_PIPELINE_NAME, null, null); + } + } + + public void testDocumentChunkingProcessor_withCascade_successful() throws Exception { + try { + createPipelineProcessor(CASCADE_PIPELINE_NAME); + createDocumentChunkingIndex(CASCADE_PIPELINE_NAME); + ingestDocument(TEST_DOCUMENT); + + List expectedPassages = new ArrayList<>(); + // " ", "." and "," will not be included in fixed token length output + expectedPassages.add("This is an example document to be chunked"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + validateIndexIngestResults(INDEX_NAME, expectedPassages); + } finally { + wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null); + } + } + + private void validateIndexIngestResults(String indexName, Object expected) { + assertEquals(1, getDocCount(indexName)); + MatchAllQueryBuilder query = new MatchAllQueryBuilder(); + Map searchResults = search(indexName, query, 10); + assertNotNull(searchResults); + Map document = getFirstInnerHit(searchResults); + assertNotNull(document); + Object documentSource = document.get("_source"); + assert (documentSource instanceof Map); + @SuppressWarnings("unchecked") + Map documentSourceMap = (Map) documentSource; + assert (documentSourceMap).containsKey(OUTPUT_FIELD); + Object ingestOutputs = documentSourceMap.get(OUTPUT_FIELD); + assertEquals(expected, ingestOutputs); + } + + private void createPipelineProcessor(final String pipelineName) throws Exception { + String requestBody = Files.readString(Path.of(classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName)).toURI())); + Response pipelineCreateResponse = makeRequest( + client(), + "PUT", + "/_ingest/pipeline/" + pipelineName, + null, + toHttpEntity(String.format(LOCALE, requestBody)), + ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT)) + ); + Map node = XContentHelper.convertToMap( + XContentType.JSON.xContent(), + EntityUtils.toString(pipelineCreateResponse.getEntity()), + false + ); + assertEquals("true", node.get("acknowledged").toString()); + } + + private void createDocumentChunkingIndex(String pipelineName) throws Exception { + createIndexWithConfiguration( + INDEX_NAME, + Files.readString(Path.of(classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json").toURI())), + pipelineName + ); + } + + private void ingestDocument(String documentPath) throws Exception { + String ingestDocument = Files.readString(Path.of(classLoader.getResource(documentPath).toURI())); + Response response = makeRequest( + client(), + "POST", + INDEX_NAME + "/_doc?refresh", + null, + toHttpEntity(ingestDocument), + ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, "Kibana")) + ); + Map map = XContentHelper.convertToMap( + XContentType.JSON.xContent(), + EntityUtils.toString(response.getEntity()), + false + ); + assertEquals("created", map.get("result")); + } +} diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java 
b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index daadcbb6a..d02c75350 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -199,10 +199,7 @@ public void testCreate_whenFieldMapWithIllegalKey_failure() { IllegalArgumentException.class, () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals( - "found parameter entry with non-string key", - illegalArgumentException.getMessage() - ); + assertEquals("found parameter entry with non-string key", illegalArgumentException.getMessage()); } public void testCreate_whenFieldMapWithNoAlgorithm_failure() { diff --git a/src/test/resources/processor/chunker/DocumentChunkingIndexSettings.json b/src/test/resources/processor/chunker/DocumentChunkingIndexSettings.json new file mode 100644 index 000000000..a2b074e69 --- /dev/null +++ b/src/test/resources/processor/chunker/DocumentChunkingIndexSettings.json @@ -0,0 +1,6 @@ +{ + "settings":{ + "index.analyze.max_token_count" : 100, + "default_pipeline": "%s" + } +} diff --git a/src/test/resources/processor/chunker/DocumentChunkingTestDocument.json b/src/test/resources/processor/chunker/DocumentChunkingTestDocument.json new file mode 100644 index 000000000..673e8b1cf --- /dev/null +++ b/src/test/resources/processor/chunker/DocumentChunkingTestDocument.json @@ -0,0 +1,3 @@ +{ + "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." +} diff --git a/src/test/resources/processor/chunker/DocumentChunkingTestLongDocument.json b/src/test/resources/processor/chunker/DocumentChunkingTestLongDocument.json new file mode 100644 index 000000000..71927887b --- /dev/null +++ b/src/test/resources/processor/chunker/DocumentChunkingTestLongDocument.json @@ -0,0 +1,3 @@ +{ + "body": "This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch." 
+} diff --git a/src/test/resources/processor/chunker/PipelineForCascadedChunker.json b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json new file mode 100644 index 000000000..6302cfdfe --- /dev/null +++ b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json @@ -0,0 +1,29 @@ +{ + "description": "An example cascaded pipeline with fixed token length algorithm after chunking algorithm", + "processors" : [ + { + "chunking": { + "field_map": { + "body": { + "delimiter": { + "delimiter": "." + }, + "output_field": "body_chunk_intermediate" + } + } + } + }, + { + "chunking": { + "field_map": { + "body_chunk_intermediate": { + "fix_length": { + "token_limit": 10 + }, + "output_field": "body_chunk" + } + } + } + } + ] +} diff --git a/src/test/resources/processor/DelimiterChunkerProcessor.json b/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json similarity index 67% rename from src/test/resources/processor/DelimiterChunkerProcessor.json rename to src/test/resources/processor/chunker/PipelineForDelimiterChunker.json index 08077bc54..9ababd6ed 100644 --- a/src/test/resources/processor/DelimiterChunkerProcessor.json +++ b/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json @@ -4,11 +4,11 @@ { "chunking": { "field_map": { - "body_chunk1": { + "body": { "delimiter": { - "delimiter": "\n" + "delimiter": "." }, - "output_field": "body_chunk2" + "output_field": "body_chunk" } } } diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json new file mode 100644 index 000000000..27daf19c8 --- /dev/null +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json @@ -0,0 +1,17 @@ +{ + "description": "An example fixed token length chunker pipeline", + "processors" : [ + { + "chunking": { + "field_map": { + "body": { + "fix_length": { + "token_limit": 10 + }, + "output_field": "body_chunk" + } + } + } + } + ] +} From 31bf921865b1a2bc9f2accd7ba35f435c9aa6d44 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 29 Feb 2024 11:51:43 +0800 Subject: [PATCH 036/189] add back Run_Neural_Search.xml Signed-off-by: yuye-aws --- .idea/runConfigurations/Run_Neural_Search.xml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .idea/runConfigurations/Run_Neural_Search.xml diff --git a/.idea/runConfigurations/Run_Neural_Search.xml b/.idea/runConfigurations/Run_Neural_Search.xml new file mode 100644 index 000000000..06406a880 --- /dev/null +++ b/.idea/runConfigurations/Run_Neural_Search.xml @@ -0,0 +1,23 @@ + + + + + + + true + true + false + + + \ No newline at end of file From 11d8f53c4fa884a3eed3ba27ba186b88b8e8d064 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 29 Feb 2024 11:55:51 +0800 Subject: [PATCH 037/189] restore Run_Neural_Search.xml Signed-off-by: yuye-aws --- .idea/runConfigurations/Run_Neural_Search.xml | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.idea/runConfigurations/Run_Neural_Search.xml b/.idea/runConfigurations/Run_Neural_Search.xml index 06406a880..d881bd512 100644 --- a/.idea/runConfigurations/Run_Neural_Search.xml +++ b/.idea/runConfigurations/Run_Neural_Search.xml @@ -1,23 +1,23 @@ - - - - - - true - true - false - - + + + + + + true + true + false + + \ No newline at end of file From 0662278cdc112367035ee46006ced084ef0b9dc2 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 29 Feb 2024 12:06:30 +0800 Subject: [PATCH 038/189] add 
changelog Signed-off-by: yuye-aws --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57a2f7a4d..e7fadfe4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased 3.0](https://github.com/opensearch-project/neural-search/compare/2.x...HEAD) ### Features +- Implement document chunking processor with fixed token length and delimiter algorithm ([#607](https://github.com/opensearch-project/neural-search/pull/607/)) ### Enhancements ### Bug Fixes - Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438)) From 5e75e046f333773275628f978213cb90430630dc Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 29 Feb 2024 12:15:00 +0800 Subject: [PATCH 039/189] update integration test for cascade processor Signed-off-by: yuye-aws --- .../DocumentChunkingProcessorIT.java | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index 320d7ac6f..d8caa64da 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -28,6 +28,8 @@ public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT { private static final String OUTPUT_FIELD = "body_chunk"; + private static final String INTERMEDIATE_FIELD = "body_chunk_intermediate"; + private static final String FIXED_TOKEN_LENGTH_PIPELINE_NAME = "pipeline-document-chunking-fixed-token-length"; private static final String DELIMITER_PIPELINE_NAME = "pipeline-document-chunking-delimiter"; @@ -63,7 +65,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLength_successful() thro expectedPassages.add("This is an example document to be chunked The document"); expectedPassages.add("The document contains a single paragraph two sentences and 24"); expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); - validateIndexIngestResults(INDEX_NAME, expectedPassages); + validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null); } @@ -94,7 +96,7 @@ public void testDocumentChunkingProcessor_withDelimiter_successful() throws Exce expectedPassages.add( " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); - validateIndexIngestResults(INDEX_NAME, expectedPassages); + validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { wipeOfTestResources(INDEX_NAME, DELIMITER_PIPELINE_NAME, null, null); } @@ -111,13 +113,21 @@ public void testDocumentChunkingProcessor_withCascade_successful() throws Except expectedPassages.add("This is an example document to be chunked"); expectedPassages.add("The document contains a single paragraph two sentences and 24"); expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); - validateIndexIngestResults(INDEX_NAME, expectedPassages); + validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); + + expectedPassages.clear(); + expectedPassages.add("This is an example document to be chunked."); + expectedPassages.add( + " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, expectedPassages); + } finally { wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null); } } - private void validateIndexIngestResults(String indexName, Object expected) { + private void validateIndexIngestResults(String indexName, String fieldName, Object expected) { assertEquals(1, getDocCount(indexName)); MatchAllQueryBuilder query = new MatchAllQueryBuilder(); Map searchResults = search(indexName, query, 10); @@ -128,8 +138,8 @@ private void validateIndexIngestResults(String indexName, Object expected) { assert (documentSource instanceof Map); @SuppressWarnings("unchecked") Map documentSourceMap = (Map) documentSource; - assert (documentSourceMap).containsKey(OUTPUT_FIELD); - Object ingestOutputs = documentSourceMap.get(OUTPUT_FIELD); + assert (documentSourceMap).containsKey(fieldName); + Object ingestOutputs = documentSourceMap.get(fieldName); assertEquals(expected, ingestOutputs); } From 962ed326a9c90c5565d063c018499f97b831d8d8 Mon Sep 17 00:00:00 2001 From: xinyual Date: Fri, 1 Mar 2024 11:01:16 +0800 Subject: [PATCH 040/189] add max chunk limit Signed-off-by: xinyual Signed-off-by: yuye-aws --- gradle.properties | 1 + .../processor/chunker/DelimiterChunker.java | 23 +++++++++++++++++ .../chunker/DelimiterChunkerTests.java | 25 +++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/gradle.properties b/gradle.properties index df3092f50..6d22deaa5 100644 --- a/gradle.properties +++ b/gradle.properties @@ -16,3 +16,4 @@ org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAME --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED +customDistributionUrl=https://artifacts.opensearch.org/snapshots/core/opensearch/3.0.0-SNAPSHOT/opensearch-min-3.0.0-SNAPSHOT-darwin-x64-latest.tar.gz diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 31577604f..2670d3882 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -14,6 +14,8 @@ public DelimiterChunker() {} public static String DELIMITER_FIELD = "delimiter"; + public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + @Override public void validateParameters(Map parameters) { if (parameters.containsKey(DELIMITER_FIELD)) { 
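
A minimal usage sketch for the new parameter, not taken from this patch: following the field_map convention of PipelineForDelimiterChunker.json earlier in this series, "max_chunk_limit" sits next to "delimiter" inside the algorithm map, with a hypothetical limit of "5" quoted because this revision parses the value from a string via Integer.valueOf:

{
  "chunking": {
    "field_map": {
      "body": {
        "delimiter": {
          "delimiter": "\n",
          "max_chunk_limit": "5"
        },
        "output_field": "body_chunk"
      }
    }
  }
}
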
@@ -26,11 +28,26 @@ public void validateParameters(Map parameters) { } else { throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter."); } + if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { + Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD); + if (!(maxChunkLimit instanceof String)) { + throw new IllegalArgumentException( + "Parameter max_chunk_limit:" + maxChunkLimit.toString() + " cannot be converted to integer." + ); + } else { + try { + int maxChunkingNumber = Integer.valueOf((String) maxChunkLimit); + } catch (Exception exception) { + throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " cannot be converted to integer."); + } + } + } } @Override public List chunk(String content, Map parameters) { String delimiter = (String) parameters.get(DELIMITER_FIELD); + int maxChunkingNumber = Integer.valueOf((String) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, "0")); List chunkResult = new ArrayList<>(); int start = 0; int end = content.indexOf(delimiter); @@ -39,10 +56,16 @@ public List chunk(String content, Map parameters) { chunkResult.add(content.substring(start, end + delimiter.length())); start = end + delimiter.length(); end = content.indexOf(delimiter, start); + if (chunkResult.size() > maxChunkingNumber && maxChunkingNumber > 0) { + throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkingNumber); + } } if (start < content.length()) { chunkResult.add(content.substring(start)); + if (chunkResult.size() > maxChunkingNumber && maxChunkingNumber > 0) { + throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkingNumber); + } } return chunkResult; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index d201ab574..147f91588 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -13,6 +13,7 @@ import static junit.framework.TestCase.assertEquals; import static org.junit.Assert.assertThrows; import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.MAX_CHUNK_LIMIT_FIELD; public class DelimiterChunkerTests extends OpenSearchTestCase { @@ -24,6 +25,22 @@ public void testChunkerWithNoDelimiterField() { Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter.", exception.getMessage()); } + public void testChunkerWithWrongLimitFieldList() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, List.of("-1"), DELIMITER_FIELD, "\n"); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("Parameter max_chunk_limit:" + List.of("-1") + " cannot be converted to integer.", exception.getMessage()); + } + + public void testChunkerWithWrongLimitField() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, "1000\n", DELIMITER_FIELD, "\n"); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("Parameter 
max_chunk_limit:1000\n cannot be converted to integer.", exception.getMessage()); + } + public void testChunkerWithDelimiterFieldNotString() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -40,6 +57,14 @@ public void testChunkerWithDelimiterFieldNoString() { Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage()); } + public void testChunkerWithLimitNumber() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(DELIMITER_FIELD, "\n", MAX_CHUNK_LIMIT_FIELD, "1"); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.chunk(content, inputParameters)); + Assert.assertEquals("Exceed max chunk number: 1", exception.getMessage()); + } + public void testChunker() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; From 9487de5284d86efabc0f669e611337aa7d15985f Mon Sep 17 00:00:00 2001 From: xinyual Date: Fri, 1 Mar 2024 11:07:29 +0800 Subject: [PATCH 041/189] remove useless and apply spotless Signed-off-by: xinyual Signed-off-by: yuye-aws --- gradle.properties | 1 - 1 file changed, 1 deletion(-) diff --git a/gradle.properties b/gradle.properties index 6d22deaa5..df3092f50 100644 --- a/gradle.properties +++ b/gradle.properties @@ -16,4 +16,3 @@ org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAME --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED -customDistributionUrl=https://artifacts.opensearch.org/snapshots/core/opensearch/3.0.0-SNAPSHOT/opensearch-min-3.0.0-SNAPSHOT-darwin-x64-latest.tar.gz From 04043cad0b328f8638245964a27e1ed4be2c3563 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 1 Mar 2024 11:26:47 +0800 Subject: [PATCH 042/189] update error message Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/DelimiterChunker.java | 2 +- .../neuralsearch/processor/chunker/DelimiterChunkerTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 2670d3882..38b3d4c89 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -26,7 +26,7 @@ public void validateParameters(Map parameters) { throw new IllegalArgumentException("delimiter parameters should not be empty."); } } else { - throw new IllegalArgumentException("You must contain field:" + DELIMITER_FIELD + " in your parameter."); + throw new IllegalArgumentException("You must contain field: " + DELIMITER_FIELD + " in your parameter."); } if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 147f91588..83a53ed8f 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -22,7 +22,7 @@ public void testChunkerWithNoDelimiterField() { String content = "a\nb\nc\nd"; Map 
inputParameters = Map.of("", ""); Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter.", exception.getMessage()); + Assert.assertEquals("You must contain field: " + DELIMITER_FIELD + " in your parameter.", exception.getMessage()); } public void testChunkerWithWrongLimitFieldList() { From 08bf2d1f489aa6dac6e848f6bca3ff675a8d843d Mon Sep 17 00:00:00 2001 From: xinyual Date: Fri, 1 Mar 2024 12:22:38 +0800 Subject: [PATCH 043/189] change field UT Signed-off-by: xinyual Signed-off-by: yuye-aws --- gradle.properties | 1 + .../processor/chunker/DelimiterChunker.java | 28 ++++++++----------- .../chunker/DelimiterChunkerTests.java | 2 +- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/gradle.properties b/gradle.properties index df3092f50..6d22deaa5 100644 --- a/gradle.properties +++ b/gradle.properties @@ -16,3 +16,4 @@ org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAME --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED +customDistributionUrl=https://artifacts.opensearch.org/snapshots/core/opensearch/3.0.0-SNAPSHOT/opensearch-min-3.0.0-SNAPSHOT-darwin-x64-latest.tar.gz diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 38b3d4c89..490309947 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -30,16 +30,10 @@ public void validateParameters(Map parameters) { } if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD); - if (!(maxChunkLimit instanceof String)) { + if (!(maxChunkLimit instanceof Integer)) { throw new IllegalArgumentException( "Parameter max_chunk_limit:" + maxChunkLimit.toString() + " cannot be converted to integer." 
); - } else { - try { - int maxChunkingNumber = Integer.valueOf((String) maxChunkLimit); - } catch (Exception exception) { - throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " cannot be converted to integer."); - } } } } @@ -47,27 +41,29 @@ public void validateParameters(Map parameters) { @Override public List chunk(String content, Map parameters) { String delimiter = (String) parameters.get(DELIMITER_FIELD); - int maxChunkingNumber = Integer.valueOf((String) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, "0")); + int maxChunkingNumber = (int) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, 0); List chunkResult = new ArrayList<>(); int start = 0; int end = content.indexOf(delimiter); while (end != -1) { - chunkResult.add(content.substring(start, end + delimiter.length())); + addChunkResult(chunkResult, maxChunkingNumber, content.substring(start, end + delimiter.length())); start = end + delimiter.length(); end = content.indexOf(delimiter, start); - if (chunkResult.size() > maxChunkingNumber && maxChunkingNumber > 0) { - throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkingNumber); - } + } if (start < content.length()) { - chunkResult.add(content.substring(start)); - if (chunkResult.size() > maxChunkingNumber && maxChunkingNumber > 0) { - throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkingNumber); - } + addChunkResult(chunkResult, maxChunkingNumber, content.substring(start)); } return chunkResult; } + + private void addChunkResult(List chunkResult, int maxChunkingNumber, String candidate) { + if (chunkResult.size() >= maxChunkingNumber && maxChunkingNumber > 0) { + throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkingNumber); + } + chunkResult.add(candidate); + } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 83a53ed8f..8cae19bf3 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -60,7 +60,7 @@ public void testChunkerWithDelimiterFieldNoString() { public void testChunkerWithLimitNumber() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(DELIMITER_FIELD, "\n", MAX_CHUNK_LIMIT_FIELD, "1"); + Map inputParameters = Map.of(DELIMITER_FIELD, "\n", MAX_CHUNK_LIMIT_FIELD, 1); Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.chunk(content, inputParameters)); Assert.assertEquals("Exceed max chunk number: 1", exception.getMessage()); } From c7cc59f74a989c711c5540075e978f5b72a78809 Mon Sep 17 00:00:00 2001 From: xinyual Date: Fri, 1 Mar 2024 12:24:02 +0800 Subject: [PATCH 044/189] remove useless and apply spotless Signed-off-by: xinyual Signed-off-by: yuye-aws --- gradle.properties | 1 - 1 file changed, 1 deletion(-) diff --git a/gradle.properties b/gradle.properties index 6d22deaa5..df3092f50 100644 --- a/gradle.properties +++ b/gradle.properties @@ -16,4 +16,3 @@ org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAME --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED 
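
Across these commits the validation of max_chunk_limit converges on a simple contract: the parameter is optional, must arrive as an Integer rather than a String, and must be strictly positive. The following is a minimal sketch of that contract, assuming a plain parameter map and an invented class and method name; the exact messages follow the later hunks in this series.

import java.util.Map;

public class MaxChunkLimitValidationSketch {

    public static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";

    // The parameter is optional; when present it must be an Integer and it
    // must be strictly positive. A non-integer or non-positive value is a
    // configuration error, not something to silently coerce.
    public static void validateMaxChunkLimit(Map<String, Object> parameters) {
        if (!parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) {
            return; // absence simply means "no limit"
        }
        Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD);
        if (!(maxChunkLimit instanceof Integer)) {
            throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " should be integer.");
        }
        if ((int) maxChunkLimit <= 0) {
            throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " is not greater than 0.");
        }
    }
}

For example, a parameter map of Map.of("max_chunk_limit", 5, "delimiter", "\n") passes, while Map.of("max_chunk_limit", "1000\n", "delimiter", "\n") fails the instanceof check, which is exactly what testChunkerWithWrongLimitField exercises.
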
-customDistributionUrl=https://artifacts.opensearch.org/snapshots/core/opensearch/3.0.0-SNAPSHOT/opensearch-min-3.0.0-SNAPSHOT-darwin-x64-latest.tar.gz From 0721f7aec75d11c7b07f4509b1679805d87067d4 Mon Sep 17 00:00:00 2001 From: xinyual Date: Fri, 1 Mar 2024 15:10:17 +0800 Subject: [PATCH 045/189] change logic of max chunk number Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 8 ++++---- .../processor/chunker/DelimiterChunkerTests.java | 12 ++++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 490309947..c6b5c3ae9 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -31,9 +31,9 @@ public void validateParameters(Map parameters) { if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD); if (!(maxChunkLimit instanceof Integer)) { - throw new IllegalArgumentException( - "Parameter max_chunk_limit:" + maxChunkLimit.toString() + " cannot be converted to integer." - ); + throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit.toString() + " should be integer."); + } else if ((int) maxChunkLimit <= 0) { + throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " is not greater than 0."); } } } @@ -41,7 +41,7 @@ public void validateParameters(Map parameters) { @Override public List chunk(String content, Map parameters) { String delimiter = (String) parameters.get(DELIMITER_FIELD); - int maxChunkingNumber = (int) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, 0); + int maxChunkingNumber = (int) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, -1); List chunkResult = new ArrayList<>(); int start = 0; int end = content.indexOf(delimiter); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 8cae19bf3..eb9808982 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -30,7 +30,7 @@ public void testChunkerWithWrongLimitFieldList() { String content = "a\nb\nc\nd"; Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, List.of("-1"), DELIMITER_FIELD, "\n"); Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("Parameter max_chunk_limit:" + List.of("-1") + " cannot be converted to integer.", exception.getMessage()); + Assert.assertEquals("Parameter max_chunk_limit:" + List.of("-1") + " should be integer.", exception.getMessage()); } public void testChunkerWithWrongLimitField() { @@ -38,7 +38,15 @@ public void testChunkerWithWrongLimitField() { String content = "a\nb\nc\nd"; Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, "1000\n", DELIMITER_FIELD, "\n"); Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("Parameter max_chunk_limit:1000\n cannot be converted to integer.", exception.getMessage()); + Assert.assertEquals("Parameter max_chunk_limit:1000\n should be integer.", 
exception.getMessage()); + } + + public void testChunkerWithNegativeLimit() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a\nb\nc\nd"; + Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, -1, DELIMITER_FIELD, "\n"); + Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); + Assert.assertEquals("Parameter max_chunk_limit:-1 is not greater than 0.", exception.getMessage()); } public void testChunkerWithDelimiterFieldNotString() { From d2bc57618085ae5034863688906546ad90fcc664 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 1 Mar 2024 17:29:58 +0800 Subject: [PATCH 046/189] add max chunk limit into fixed token length algorithm Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 3 +- .../chunker/FixedTokenLengthChunker.java | 46 +++++++++++++------ 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index c6b5c3ae9..fc5f41d24 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -22,7 +22,7 @@ public void validateParameters(Map parameters) { Object delimiter = parameters.get(DELIMITER_FIELD); if (!(delimiter instanceof String)) { throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string."); - } else if (((String) delimiter).length() == 0) { + } else if (((String) delimiter).isEmpty()) { throw new IllegalArgumentException("delimiter parameters should not be empty."); } } else { @@ -50,7 +50,6 @@ public List chunk(String content, Map parameters) { addChunkResult(chunkResult, maxChunkingNumber, content.substring(start, end + delimiter.length())); start = end + delimiter.length(); end = content.indexOf(delimiter, start); - } if (start < content.length()) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 3079fcf8e..b57a8fdca 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -21,15 +21,15 @@ public class FixedTokenLengthChunker implements IFieldChunker { public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; - public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; - + public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; public static final String TOKENIZER_FIELD = "tokenizer"; // default values for each parameter private static final int DEFAULT_TOKEN_LIMIT = 500; private static final double DEFAULT_OVERLAP_RATE = 0.2; private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; + private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; private static final String DEFAULT_TOKENIZER = "standard"; private final AnalysisRegistry analysisRegistry; @@ -62,6 +62,8 @@ public List chunk(String content, Map parameters) { int tokenLimit = DEFAULT_TOKEN_LIMIT; double overlapRate = DEFAULT_OVERLAP_RATE; int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; + int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; + String tokenizer = DEFAULT_TOKENIZER; if 
(parameters.containsKey(TOKEN_LIMIT_FIELD)) { @@ -76,6 +78,9 @@ public List chunk(String content, Map parameters) { if (parameters.containsKey(TOKENIZER_FIELD)) { tokenizer = (String) parameters.get(TOKENIZER_FIELD); } + if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { + maxChunkLimit = ((Number) parameters.get(MAX_CHUNK_LIMIT_FIELD)).intValue(); + } List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); @@ -90,29 +95,42 @@ public List chunk(String content, Map parameters) { if (startToken + tokenLimit >= tokens.size()) { // break the loop when already cover the last token passage = String.join(" ", tokens.subList(startToken, tokens.size())); - passages.add(passage); + addPassageToList(passages, passage, maxChunkLimit); break; } else { passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); - passages.add(passage); + addPassageToList(passages, passage, maxChunkLimit); } startToken += tokenLimit - overlapTokenNumber; } return passages; } + private void addPassageToList(List passages, String passage, int maxChunkLimit) { + if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && passages.size() + 1 >= maxChunkLimit) { + throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkLimit); + } + passages.add(passage); + } + + private void validatePositiveIntegerParameter(Map parameters, String fieldName) { + // this method validate that parameter is a positive integer + // this method accepts positive float or double number + if (!(parameters.get(fieldName) instanceof Number)) { + throw new IllegalArgumentException( + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + if (((Number) parameters.get(fieldName)).intValue() <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + } + } + @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { - if (!(parameters.get(TOKEN_LIMIT_FIELD) instanceof Number)) { - throw new IllegalArgumentException( - "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - if (((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue() <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive"); - } - } + validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD); + validatePositiveIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD); + validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD); if (parameters.containsKey(OVERLAP_RATE_FIELD)) { if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { From 120fae872f224ba670a09f778e06f9b641b81a1e Mon Sep 17 00:00:00 2001 From: zane-neo Date: Fri, 1 Mar 2024 14:40:14 +0800 Subject: [PATCH 047/189] Support list> type in embedding and extract validation logic to common class Signed-off-by: zane-neo Signed-off-by: yuye-aws --- .../neuralsearch/plugin/NeuralSearch.java | 6 +- .../processor/InferenceProcessor.java | 118 +++++++----------- .../processor/ProcessorInputValidator.java | 93 ++++++++++++++ .../processor/SparseEncodingProcessor.java | 7 +- .../processor/TextEmbeddingProcessor.java | 7 +- .../SparseEncodingProcessorFactory.java | 19 ++- .../TextEmbeddingProcessorFactory.java | 20 ++- .../TextEmbeddingProcessorTests.java | 14 +-- 8 files changed, 190 insertions(+), 94 deletions(-) create mode 100644 
src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index dc5b6e8f2..13f554621 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -29,6 +29,7 @@ import org.opensearch.neuralsearch.processor.NeuralQueryEnricherProcessor; import org.opensearch.neuralsearch.processor.NormalizationProcessor; import org.opensearch.neuralsearch.processor.NormalizationProcessorWorkflow; +import org.opensearch.neuralsearch.processor.ProcessorInputValidator; import org.opensearch.neuralsearch.processor.SparseEncodingProcessor; import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor; import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor; @@ -109,11 +110,12 @@ public List> getQueries() { @Override public Map getProcessors(Processor.Parameters parameters) { clientAccessor = new MLCommonsClientAccessor(new MachineLearningNodeClient(parameters.client)); + ProcessorInputValidator processorInputValidator = new ProcessorInputValidator(); return Map.of( TextEmbeddingProcessor.TYPE, - new TextEmbeddingProcessorFactory(clientAccessor, parameters.env), + new TextEmbeddingProcessorFactory(clientAccessor, parameters.env, processorInputValidator), SparseEncodingProcessor.TYPE, - new SparseEncodingProcessorFactory(clientAccessor, parameters.env), + new SparseEncodingProcessorFactory(clientAccessor, parameters.env, processorInputValidator), TextImageEmbeddingProcessor.TYPE, new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()), DocumentChunkingProcessor.TYPE, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java index fe201abae..4762c699d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java @@ -10,13 +10,11 @@ import java.util.Map; import java.util.Objects; import java.util.function.BiConsumer; -import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.IntStream; import org.apache.commons.lang3.StringUtils; import org.opensearch.env.Environment; -import org.opensearch.index.mapper.MapperService; import org.opensearch.ingest.AbstractProcessor; import org.opensearch.ingest.IngestDocument; import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor; @@ -51,6 +49,8 @@ public abstract class InferenceProcessor extends AbstractProcessor { private final Environment environment; + private final ProcessorInputValidator processorInputValidator; + public InferenceProcessor( String tag, String description, @@ -59,7 +59,8 @@ public InferenceProcessor( String modelId, Map fieldMap, MLCommonsClientAccessor clientAccessor, - Environment environment + Environment environment, + ProcessorInputValidator processorInputValidator ) { super(tag, description); this.type = type; @@ -71,6 +72,7 @@ public InferenceProcessor( this.fieldMap = fieldMap; this.mlCommonsClientAccessor = clientAccessor; this.environment = environment; + this.processorInputValidator = processorInputValidator; } private void validateEmbeddingConfiguration(Map fieldMap) { @@ -106,13 +108,13 @@ public IngestDocument execute(IngestDocument 
ingestDocument) throws Exception { @Override public void execute(IngestDocument ingestDocument, BiConsumer handler) { try { - validateEmbeddingFieldsValue(ingestDocument); - Map ProcessMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument); - List inferenceList = createInferenceList(ProcessMap); + processorInputValidator.validateFieldsValue(fieldMap, environment, ingestDocument, false); + Map processMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument); + List inferenceList = createInferenceList(processMap); if (inferenceList.size() == 0) { handler.accept(ingestDocument, null); } else { - doExecute(ingestDocument, ProcessMap, inferenceList, handler); + doExecute(ingestDocument, processMap, inferenceList, handler); } } catch (Exception e) { handler.accept(null, e); @@ -125,7 +127,13 @@ private List createInferenceList(Map knnKeyMap) { knnKeyMap.entrySet().stream().filter(knnMapEntry -> knnMapEntry.getValue() != null).forEach(knnMapEntry -> { Object sourceValue = knnMapEntry.getValue(); if (sourceValue instanceof List) { - texts.addAll(((List) sourceValue)); + for (Object nestedValue : (List) sourceValue) { + if (nestedValue instanceof String) { + texts.add((String) nestedValue); + } else { + texts.addAll((List) nestedValue); + } + } } else if (sourceValue instanceof Map) { createInferenceListForMapTypeInput(sourceValue, texts); } else { @@ -204,68 +212,16 @@ private void buildMapWithProcessorKeyAndOriginalValueForMapType( } } - private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) { - Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); - for (Map.Entry embeddingFieldsEntry : fieldMap.entrySet()) { - Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey()); - if (sourceValue != null) { - String sourceKey = embeddingFieldsEntry.getKey(); - Class sourceValueClass = sourceValue.getClass(); - if (List.class.isAssignableFrom(sourceValueClass) || Map.class.isAssignableFrom(sourceValueClass)) { - validateNestedTypeValue(sourceKey, sourceValue, () -> 1); - } else if (!String.class.isAssignableFrom(sourceValueClass)) { - throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it"); - } else if (StringUtils.isBlank(sourceValue.toString())) { - throw new IllegalArgumentException("field [" + sourceKey + "] has empty string value, cannot process it"); - } - } - } - } - - @SuppressWarnings({ "rawtypes", "unchecked" }) - private void validateNestedTypeValue(String sourceKey, Object sourceValue, Supplier maxDepthSupplier) { - int maxDepth = maxDepthSupplier.get(); - if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) { - throw new IllegalArgumentException("map type field [" + sourceKey + "] reached max depth limit, cannot process it"); - } else if ((List.class.isAssignableFrom(sourceValue.getClass()))) { - validateListTypeValue(sourceKey, sourceValue, maxDepthSupplier); - } else if (Map.class.isAssignableFrom(sourceValue.getClass())) { - ((Map) sourceValue).values() - .stream() - .filter(Objects::nonNull) - .forEach(x -> validateNestedTypeValue(sourceKey, x, () -> maxDepth + 1)); - } else if (!String.class.isAssignableFrom(sourceValue.getClass())) { - throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it"); - } else if (StringUtils.isBlank(sourceValue.toString())) { - throw new IllegalArgumentException("map type field [" + sourceKey + "] has empty string, cannot process it"); - } - } - - 
@SuppressWarnings({ "rawtypes" }) - private void validateListTypeValue(String sourceKey, Object sourceValue, Supplier maxDepthSupplier) { - for (Object value : (List) sourceValue) { - if (value instanceof Map) { - validateNestedTypeValue(sourceKey, value, () -> maxDepthSupplier.get() + 1); - } else if (value == null) { - throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it"); - } else if (!(value instanceof String)) { - throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it"); - } else if (StringUtils.isBlank(value.toString())) { - throw new IllegalArgumentException("list type field [" + sourceKey + "] has empty string, cannot process it"); - } - } - } - - protected void setVectorFieldsToDocument(IngestDocument ingestDocument, Map processorMap, List results) { + protected void setTargetFieldsToDocument(IngestDocument ingestDocument, Map processorMap, List results) { Objects.requireNonNull(results, "embedding failed, inference returns null result!"); log.debug("Model inference result fetched, starting build vector output!"); - Map nlpResult = buildNLPResult(processorMap, results, ingestDocument.getSourceAndMetadata()); - nlpResult.forEach(ingestDocument::setFieldValue); + Map result = buildResult(processorMap, results, ingestDocument.getSourceAndMetadata()); + result.forEach(ingestDocument::setFieldValue); } @SuppressWarnings({ "unchecked" }) @VisibleForTesting - Map buildNLPResult(Map processorMap, List results, Map sourceAndMetadataMap) { + Map buildResult(Map processorMap, List results, Map sourceAndMetadataMap) { IndexWrapper indexWrapper = new IndexWrapper(0); Map result = new LinkedHashMap<>(); for (Map.Entry knnMapEntry : processorMap.entrySet()) { @@ -274,16 +230,16 @@ Map buildNLPResult(Map processorMap, List res if (sourceValue instanceof String) { result.put(knnKey, results.get(indexWrapper.index++)); } else if (sourceValue instanceof List) { - result.put(knnKey, buildNLPResultForListType((List) sourceValue, results, indexWrapper)); + result.put(knnKey, buildResultForListType((List) sourceValue, results, indexWrapper)); } else if (sourceValue instanceof Map) { - putNLPResultToSourceMapForMapType(knnKey, sourceValue, results, indexWrapper, sourceAndMetadataMap); + putResultToSourceMapForMapType(knnKey, sourceValue, results, indexWrapper, sourceAndMetadataMap); } } return result; } @SuppressWarnings({ "unchecked" }) - private void putNLPResultToSourceMapForMapType( + private void putResultToSourceMapForMapType( String processorKey, Object sourceValue, List results, @@ -294,12 +250,12 @@ private void putNLPResultToSourceMapForMapType( if (sourceValue instanceof Map) { for (Map.Entry inputNestedMapEntry : ((Map) sourceValue).entrySet()) { if (sourceAndMetadataMap.get(processorKey) instanceof List) { - // build nlp output for list of nested objects + // build output for list of nested objects for (Map nestedElement : (List>) sourceAndMetadataMap.get(processorKey)) { nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++)); } } else { - putNLPResultToSourceMapForMapType( + putResultToSourceMapForMapType( inputNestedMapEntry.getKey(), inputNestedMapEntry.getValue(), results, @@ -311,15 +267,27 @@ private void putNLPResultToSourceMapForMapType( } else if (sourceValue instanceof String) { sourceAndMetadataMap.put(processorKey, results.get(indexWrapper.index++)); } else if (sourceValue instanceof List) { - sourceAndMetadataMap.put(processorKey, 
buildNLPResultForListType((List) sourceValue, results, indexWrapper)); + sourceAndMetadataMap.put(processorKey, buildResultForListType((List) sourceValue, results, indexWrapper)); } } - private List> buildNLPResultForListType(List sourceValue, List results, IndexWrapper indexWrapper) { - List> keyToResult = new ArrayList<>(); - IntStream.range(0, sourceValue.size()) - .forEachOrdered(x -> keyToResult.add(ImmutableMap.of(listTypeNestedMapKey, results.get(indexWrapper.index++)))); - return keyToResult; + protected List buildResultForListType(List sourceValue, List results, IndexWrapper indexWrapper) { + Object peek = sourceValue.get(0); + if (peek instanceof String) { + List> keyToResult = new ArrayList<>(); + IntStream.range(0, sourceValue.size()) + .forEachOrdered(x -> keyToResult.add(ImmutableMap.of(listTypeNestedMapKey, results.get(indexWrapper.index++)))); + return keyToResult; + } else { + List>> keyToResult = new ArrayList<>(); + for (Object nestedList : sourceValue) { + List> nestedResult = new ArrayList<>(); + IntStream.range(0, ((List) nestedList).size()) + .forEachOrdered(x -> nestedResult.add(ImmutableMap.of(listTypeNestedMapKey, results.get(indexWrapper.index++)))); + keyToResult.add(nestedResult); + } + return keyToResult; + } } @Override diff --git a/src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java new file mode 100644 index 000000000..57766c911 --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java @@ -0,0 +1,93 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor; + +import org.apache.commons.lang3.StringUtils; +import org.opensearch.env.Environment; +import org.opensearch.index.mapper.MapperService; +import org.opensearch.ingest.IngestDocument; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.Supplier; + +public class ProcessorInputValidator { + + public void validateFieldsValue( + Map fieldMap, + Environment environment, + IngestDocument ingestDocument, + boolean allowEmpty + ) { + Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); + for (Map.Entry embeddingFieldsEntry : fieldMap.entrySet()) { + Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey()); + if (sourceValue != null) { + String sourceKey = embeddingFieldsEntry.getKey(); + Class sourceValueClass = sourceValue.getClass(); + if (List.class.isAssignableFrom(sourceValueClass) || Map.class.isAssignableFrom(sourceValueClass)) { + validateNestedTypeValue(sourceKey, sourceValue, environment, allowEmpty, () -> 1); + } else if (!String.class.isAssignableFrom(sourceValueClass)) { + throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it"); + } else if (!allowEmpty && StringUtils.isBlank(sourceValue.toString())) { + throw new IllegalArgumentException("field [" + sourceKey + "] has empty string value, cannot process it"); + } + } + } + } + + @SuppressWarnings({ "rawtypes", "unchecked" }) + private void validateNestedTypeValue( + String sourceKey, + Object sourceValue, + Environment environment, + boolean allowEmpty, + Supplier maxDepthSupplier + ) { + int maxDepth = maxDepthSupplier.get(); + if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) { + throw new IllegalArgumentException("map type field [" 
+ sourceKey + "] reached max depth limit, cannot process it"); + } else if ((List.class.isAssignableFrom(sourceValue.getClass()))) { + validateListTypeValue(sourceKey, sourceValue, environment, allowEmpty, maxDepthSupplier); + } else if (Map.class.isAssignableFrom(sourceValue.getClass())) { + ((Map) sourceValue).values() + .stream() + .filter(Objects::nonNull) + .forEach(x -> validateNestedTypeValue(sourceKey, x, environment, allowEmpty, () -> maxDepth + 1)); + } else if (!String.class.isAssignableFrom(sourceValue.getClass())) { + throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it"); + } else if (!allowEmpty && StringUtils.isBlank(sourceValue.toString())) { + throw new IllegalArgumentException("map type field [" + sourceKey + "] has empty string, cannot process it"); + } + } + + @SuppressWarnings({ "rawtypes" }) + private void validateListTypeValue( + String sourceKey, + Object sourceValue, + Environment environment, + boolean allowEmpty, + Supplier maxDepthSupplier + ) { + for (Object value : (List) sourceValue) { + if (value instanceof Map) { + validateNestedTypeValue(sourceKey, value, environment, allowEmpty, () -> maxDepthSupplier.get() + 1); + } else if (value == null) { + throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it"); + } else if (value instanceof List) { + for (Object nestedValue : (List) sourceValue) { + if (!(nestedValue instanceof String)) { + throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it"); + } + } + } else if (!(value instanceof String)) { + throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it"); + } else if (!allowEmpty && StringUtils.isBlank(value.toString())) { + throw new IllegalArgumentException("list type field [" + sourceKey + "] has empty string, cannot process it"); + } + } + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java index 8acf95bf7..1087651c2 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java @@ -32,9 +32,10 @@ public SparseEncodingProcessor( String modelId, Map fieldMap, MLCommonsClientAccessor clientAccessor, - Environment environment + Environment environment, + ProcessorInputValidator processorInputValidator ) { - super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment); + super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment, processorInputValidator); } @Override @@ -45,7 +46,7 @@ public void doExecute( BiConsumer handler ) { mlCommonsClientAccessor.inferenceSentencesWithMapResult(this.modelId, inferenceList, ActionListener.wrap(resultMaps -> { - setVectorFieldsToDocument(ingestDocument, ProcessMap, TokenWeightUtil.fetchListOfTokenWeightMap(resultMaps)); + setTargetFieldsToDocument(ingestDocument, ProcessMap, TokenWeightUtil.fetchListOfTokenWeightMap(resultMaps)); handler.accept(ingestDocument, null); }, e -> { handler.accept(null, e); })); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java index c1b8f92a6..ce024cd87 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java @@ -31,9 +31,10 @@ public TextEmbeddingProcessor( String modelId, Map fieldMap, MLCommonsClientAccessor clientAccessor, - Environment environment + Environment environment, + ProcessorInputValidator processorInputValidator ) { - super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment); + super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment, processorInputValidator); } @Override @@ -44,7 +45,7 @@ public void doExecute( BiConsumer handler ) { mlCommonsClientAccessor.inferenceSentences(this.modelId, inferenceList, ActionListener.wrap(vectors -> { - setVectorFieldsToDocument(ingestDocument, ProcessMap, vectors); + setTargetFieldsToDocument(ingestDocument, ProcessMap, vectors); handler.accept(ingestDocument, null); }, e -> { handler.accept(null, e); })); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java index 95b2803a0..d5b90c406 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java @@ -15,6 +15,7 @@ import org.opensearch.env.Environment; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor; +import org.opensearch.neuralsearch.processor.ProcessorInputValidator; import org.opensearch.neuralsearch.processor.SparseEncodingProcessor; import lombok.extern.log4j.Log4j2; @@ -26,10 +27,16 @@ public class SparseEncodingProcessorFactory implements Processor.Factory { private final MLCommonsClientAccessor clientAccessor; private final Environment environment; + private ProcessorInputValidator processorInputValidator; - public SparseEncodingProcessorFactory(MLCommonsClientAccessor clientAccessor, Environment environment) { + public SparseEncodingProcessorFactory( + MLCommonsClientAccessor clientAccessor, + Environment environment, + ProcessorInputValidator processorInputValidator + ) { this.clientAccessor = clientAccessor; this.environment = environment; + this.processorInputValidator = processorInputValidator; } @Override @@ -42,6 +49,14 @@ public SparseEncodingProcessor create( String modelId = readStringProperty(TYPE, processorTag, config, MODEL_ID_FIELD); Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); - return new SparseEncodingProcessor(processorTag, description, modelId, fieldMap, clientAccessor, environment); + return new SparseEncodingProcessor( + processorTag, + description, + modelId, + fieldMap, + clientAccessor, + environment, + processorInputValidator + ); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java index 7802cb1f6..061ac3474 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java @@ -15,6 +15,7 @@ import org.opensearch.env.Environment; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor; 
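
The factory changes in this patch all apply one constructor-injection pattern: a single shared validator is built once by the plugin, handed to each processor factory, and passed on to every processor the factory creates. Here is a toy, self-contained illustration of the shape of that wiring; every name in it is invented, and the real classes carry OpenSearch types and far more logic.

import java.util.Map;

class InputValidatorSketch {
    // stand-in for the shared validator: the depth, type and emptiness
    // checks over the configured field map would live here
    void validate(Map<String, Object> fieldMap, Map<String, Object> document, boolean allowEmpty) {
        if (fieldMap == null || fieldMap.isEmpty()) {
            throw new IllegalArgumentException("field map is null or empty");
        }
    }
}

interface ProcessorSketch {
    void execute(Map<String, Object> document);
}

class EmbeddingProcessorSketch implements ProcessorSketch {
    private final Map<String, Object> fieldMap;
    private final InputValidatorSketch validator;

    EmbeddingProcessorSketch(Map<String, Object> fieldMap, InputValidatorSketch validator) {
        this.fieldMap = fieldMap;
        this.validator = validator;
    }

    @Override
    public void execute(Map<String, Object> document) {
        validator.validate(fieldMap, document, false); // fail fast before any inference call
        // ... inference and result mapping would follow here
    }
}

class EmbeddingProcessorFactorySketch {
    private final InputValidatorSketch validator;

    EmbeddingProcessorFactorySketch(InputValidatorSketch validator) {
        this.validator = validator;
    }

    ProcessorSketch create(Map<String, Object> fieldMap) {
        return new EmbeddingProcessorSketch(fieldMap, validator);
    }
}

The payoff shows up in the test hunks further on: the validator can be declared once with @Mock and injected wherever a factory is constructed.
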
+import org.opensearch.neuralsearch.processor.ProcessorInputValidator; import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor; /** @@ -26,9 +27,16 @@ public class TextEmbeddingProcessorFactory implements Processor.Factory { private final Environment environment; - public TextEmbeddingProcessorFactory(final MLCommonsClientAccessor clientAccessor, final Environment environment) { + private ProcessorInputValidator processorInputValidator; + + public TextEmbeddingProcessorFactory( + final MLCommonsClientAccessor clientAccessor, + final Environment environment, + ProcessorInputValidator processorInputValidator + ) { this.clientAccessor = clientAccessor; this.environment = environment; + this.processorInputValidator = processorInputValidator; } @Override @@ -40,6 +48,14 @@ public TextEmbeddingProcessor create( ) throws Exception { String modelId = readStringProperty(TYPE, processorTag, config, MODEL_ID_FIELD); Map filedMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); - return new TextEmbeddingProcessor(processorTag, description, modelId, filedMap, clientAccessor, environment); + return new TextEmbeddingProcessor( + processorTag, + description, + modelId, + filedMap, + clientAccessor, + environment, + processorInputValidator + ); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java index 60408d820..25d41c345 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java @@ -357,7 +357,7 @@ public void testProcessResponse_successful() throws Exception { Map knnMap = processor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument); List> modelTensorList = createMockVectorResult(); - processor.setVectorFieldsToDocument(ingestDocument, knnMap, modelTensorList); + processor.setTargetFieldsToDocument(ingestDocument, knnMap, modelTensorList); assertEquals(12, ingestDocument.getSourceAndMetadata().size()); } @@ -378,7 +378,7 @@ public void testBuildVectorOutput_withPlainStringValue_successful() { assertEquals(knnKeyList.get(lastIndex), configValueList.get(lastIndex).toString()); List> modelTensorList = createMockVectorResult(); - Map result = processor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); + Map result = processor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); assertTrue(result.containsKey("oriKey1_knn")); assertTrue(result.containsKey("oriKey2_knn")); assertTrue(result.containsKey("oriKey3_knn")); @@ -395,7 +395,7 @@ public void testBuildVectorOutput_withNestedMap_successful() { TextEmbeddingProcessor processor = createInstanceWithNestedMapConfiguration(config); Map knnMap = processor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument); List> modelTensorList = createMockVectorResult(); - processor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); + processor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); Map favoritesMap = (Map) ingestDocument.getSourceAndMetadata().get("favorites"); assertNotNull(favoritesMap); Map favoriteGames = (Map) favoritesMap.get("favorite.games"); @@ -411,7 +411,7 @@ public void testBuildVectorOutput_withNestedList_successful() { TextEmbeddingProcessor textEmbeddingProcessor = createInstanceWithNestedMapConfiguration(config); Map knnMap = 
textEmbeddingProcessor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument); List> modelTensorList = createMockVectorResult(); - textEmbeddingProcessor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); + textEmbeddingProcessor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); List> nestedObj = (List>) ingestDocument.getSourceAndMetadata().get("nestedField"); assertTrue(nestedObj.get(0).containsKey("vectorField")); assertTrue(nestedObj.get(1).containsKey("vectorField")); @@ -425,7 +425,7 @@ public void testBuildVectorOutput_withNestedList_Level2_successful() { TextEmbeddingProcessor textEmbeddingProcessor = createInstanceWithNestedMapConfiguration(config); Map knnMap = textEmbeddingProcessor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument); List> modelTensorList = createMockVectorResult(); - textEmbeddingProcessor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); + textEmbeddingProcessor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); Map nestedLevel1 = (Map) ingestDocument.getSourceAndMetadata().get("nestedField"); List> nestedObj = (List>) nestedLevel1.get("nestedField"); assertTrue(nestedObj.get(0).containsKey("vectorField")); @@ -440,10 +440,10 @@ public void test_updateDocument_appendVectorFieldsToDocument_successful() { TextEmbeddingProcessor processor = createInstanceWithNestedMapConfiguration(config); Map knnMap = processor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument); List> modelTensorList = createMockVectorResult(); - processor.setVectorFieldsToDocument(ingestDocument, knnMap, modelTensorList); + processor.setTargetFieldsToDocument(ingestDocument, knnMap, modelTensorList); List> modelTensorList1 = createMockVectorResult(); - processor.setVectorFieldsToDocument(ingestDocument, knnMap, modelTensorList1); + processor.setTargetFieldsToDocument(ingestDocument, knnMap, modelTensorList1); assertEquals(12, ingestDocument.getSourceAndMetadata().size()); assertEquals(2, ((List) ingestDocument.getSourceAndMetadata().get("oriKey6_knn")).size()); } From 0af30248faa12c91c31d9c39e84d12b7687cfe44 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 1 Mar 2024 21:46:09 +0800 Subject: [PATCH 048/189] fix unit tests for inference processor Signed-off-by: yuye-aws --- .../processor/SparseEncodingProcessorTests.java | 9 ++++++++- .../processor/TextEmbeddingProcessorTests.java | 15 +++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java index 815ea851b..36cad43e3 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java @@ -51,6 +51,9 @@ public class SparseEncodingProcessorTests extends OpenSearchTestCase { @Mock private Environment env; + @Mock + private ProcessorInputValidator processorInputValidator; + @InjectMocks private SparseEncodingProcessorFactory SparseEncodingProcessorFactory; private static final String PROCESSOR_TAG = "mockTag"; @@ -97,7 +100,11 @@ public void testExecute_whenInferenceTextListEmpty_SuccessWithoutAnyMap() { IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>()); Map registry = new HashMap<>(); MLCommonsClientAccessor accessor = mock(MLCommonsClientAccessor.class); - 
SparseEncodingProcessorFactory sparseEncodingProcessorFactory = new SparseEncodingProcessorFactory(accessor, env);
+        SparseEncodingProcessorFactory sparseEncodingProcessorFactory = new SparseEncodingProcessorFactory(
+            accessor,
+            env,
+            processorInputValidator
+        );
         Map config = new HashMap<>();
         config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId");
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
index 25d41c345..db323bd4b 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
@@ -51,6 +51,9 @@ public class TextEmbeddingProcessorTests extends OpenSearchTestCase {
     @Mock
     private Environment env;

+    @Mock
+    private ProcessorInputValidator processorInputValidator;
+
     @InjectMocks
     private TextEmbeddingProcessorFactory textEmbeddingProcessorFactory;
     private static final String PROCESSOR_TAG = "mockTag";
@@ -127,7 +130,11 @@ public void testExecute_whenInferenceThrowInterruptedException_throwRuntimeExcep
         IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
         Map registry = new HashMap<>();
         MLCommonsClientAccessor accessor = mock(MLCommonsClientAccessor.class);
-        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(accessor, env);
+        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(
+            accessor,
+            env,
+            processorInputValidator
+        );
         Map config = new HashMap<>();
         config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId");
@@ -145,7 +152,11 @@ public void testExecute_whenInferenceTextListEmpty_SuccessWithoutEmbedding() {
         IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
         Map registry = new HashMap<>();
         MLCommonsClientAccessor accessor = mock(MLCommonsClientAccessor.class);
-        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(accessor, env);
+        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(
+            accessor,
+            env,
+            processorInputValidator
+        );
         Map config = new HashMap<>();
         config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId");

From e69bbe12bbd964ec254fd7340acca78e5ab2d4d2 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 1 Mar 2024 21:46:44 +0800
Subject: [PATCH 049/189] implement unit tests for max_chunk_limit in fixed token length

Signed-off-by: yuye-aws
---
 .../processor/chunker/DelimiterChunker.java   |  2 +-
 .../chunker/FixedTokenLengthChunker.java      | 10 ++-
 .../chunker/FixedTokenLengthChunkerTests.java | 72 +++++++++++++++++--
 3 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
index fc5f41d24..4062c560e 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
@@ -61,7 +61,7 @@ public List chunk(String content, Map parameters) {

     private void addChunkResult(List chunkResult, int maxChunkingNumber, String candidate) {
         if (chunkResult.size() >= maxChunkingNumber && maxChunkingNumber > 0) {
-            throw new IllegalArgumentException("Exceed max chunk number: 
" + maxChunkingNumber); + throw new IllegalStateException("Exceed max chunk number: " + maxChunkingNumber); } chunkResult.add(candidate); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index b57a8fdca..082e00620 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -107,8 +107,8 @@ public List chunk(String content, Map parameters) { } private void addPassageToList(List passages, String passage, int maxChunkLimit) { - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && passages.size() + 1 >= maxChunkLimit) { - throw new IllegalArgumentException("Exceed max chunk number: " + maxChunkLimit); + if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && passages.size() >= maxChunkLimit) { + throw new IllegalStateException("Exceed max chunk number: " + maxChunkLimit); } passages.add(passage); } @@ -116,9 +116,13 @@ private void addPassageToList(List passages, String passage, int maxChun private void validatePositiveIntegerParameter(Map parameters, String fieldName) { // this method validate that parameter is a positive integer // this method accepts positive float or double number + if (!parameters.containsKey(fieldName)) { + // all parameters are optional + return; + } if (!(parameters.get(fieldName) instanceof Number)) { throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } if (((Number) parameters.get(fieldName)).intValue() <= 0) { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 4c498d070..d24d9f423 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -23,9 +23,10 @@ import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.MAX_CHUNK_LIMIT_FIELD; public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { @@ -66,7 +67,10 @@ public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) ); - assertEquals("fixed length parameter [token_limit] cannot be cast to [java.lang.Number]", illegalArgumentException.getMessage()); + assertEquals( + "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", + illegalArgumentException.getMessage() + ); } public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { @@ -76,7 +80,7 @@ public 
void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) ); - assertEquals("fixed length parameter [token_limit] must be positive", illegalArgumentException.getMessage()); + assertEquals("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive", illegalArgumentException.getMessage()); } public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { @@ -86,7 +90,10 @@ public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) ); - assertEquals("fixed length parameter [overlap_rate] cannot be cast to [java.lang.Number]", illegalArgumentException.getMessage()); + assertEquals( + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", + illegalArgumentException.getMessage() + ); } public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { @@ -97,7 +104,7 @@ public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { () -> FixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( - "fixed length parameter [overlap_rate] must be between 0 and 1, 1 is not included.", + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 1, 1 is not included.", illegalArgumentException.getMessage() ); } @@ -109,7 +116,33 @@ public void testValidateParameters_whenIllegalTokenizerType_thenFail() { IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) ); - assertEquals("fixed length parameter [tokenizer] cannot be cast to [java.lang.String]", illegalArgumentException.getMessage()); + assertEquals( + "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]", + illegalArgumentException.getMessage() + ); + } + + public void testValidateParameters_whenIllegalChunkLimitType_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(MAX_CHUNK_LIMIT_FIELD, "invalid chunk limit"); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> FixedTokenLengthChunker.validateParameters(parameters) + ); + assertEquals( + "fixed length parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", + illegalArgumentException.getMessage() + ); + } + + public void testValidateParameters_whenIllegalChunkLimitValue_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(MAX_CHUNK_LIMIT_FIELD, -1); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> FixedTokenLengthChunker.validateParameters(parameters) + ); + assertEquals("fixed length parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be positive", illegalArgumentException.getMessage()); } public void testChunk_withTokenLimit_10() { @@ -153,4 +186,31 @@ public void testChunk_withOverlapRate_half() { expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } + + public void testChunk_withMaxChunkLimitOne_thenFail() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(MAX_CHUNK_LIMIT_FIELD, 1); + String content = + "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + IllegalStateException illegalStateException = assertThrows( + IllegalStateException.class, + () -> FixedTokenLengthChunker.chunk(content, parameters) + ); + assertEquals("Exceed max chunk number: 1", illegalStateException.getMessage()); + } + + public void testChunk_withMaxChunkLimitTen_thenSuccess() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(MAX_CHUNK_LIMIT_FIELD, 10); + String content = + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + List passages = FixedTokenLengthChunker.chunk(content, parameters); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } } From f21f40f9ddf897fb8b1b6a90cc6b2fab66a0b38d Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 4 Mar 2024 11:25:55 +0800 Subject: [PATCH 050/189] constructor for inference processor Signed-off-by: yuye-aws --- .../java/org/opensearch/neuralsearch/plugin/NeuralSearch.java | 3 ++- .../opensearch/neuralsearch/processor/InferenceProcessor.java | 3 +-- .../neuralsearch/processor/SparseEncodingProcessor.java | 2 ++ .../neuralsearch/processor/TextEmbeddingProcessor.java | 2 ++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index 13f554621..de3839a68 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -123,7 +123,8 @@ public Map getProcessors(Processor.Parameters paramet parameters.env.settings(), parameters.ingestService.getClusterService(), parameters.indicesService, - parameters.analysisRegistry + parameters.analysisRegistry, + processorInputValidator ) ); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java index 4762c699d..e072e5d7d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java @@ -64,7 +64,6 @@ public InferenceProcessor( ) { super(tag, description); this.type = type; - if (StringUtils.isBlank(modelId)) throw new IllegalArgumentException("model_id is null or empty, cannot process it"); validateEmbeddingConfiguration(fieldMap); this.listTypeNestedMapKey = listTypeNestedMapKey; @@ -111,7 +110,7 @@ public void execute(IngestDocument ingestDocument, BiConsumer processMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument); List inferenceList = createInferenceList(processMap); - if (inferenceList.size() == 0) { + if (inferenceList.isEmpty()) { handler.accept(ingestDocument, null); } else { doExecute(ingestDocument, processMap, inferenceList, handler); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java index 1087651c2..30486b048 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.function.BiConsumer; +import org.apache.commons.lang3.StringUtils; import org.opensearch.core.action.ActionListener; import org.opensearch.env.Environment; import org.opensearch.ingest.IngestDocument; @@ -36,6 +37,7 @@ public SparseEncodingProcessor( ProcessorInputValidator processorInputValidator ) { super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment, processorInputValidator); + if (StringUtils.isBlank(modelId)) throw new IllegalArgumentException("model_id is null or empty, cannot process it"); } @Override diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java index ce024cd87..d98d41728 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.function.BiConsumer; +import org.apache.commons.lang3.StringUtils; import org.opensearch.core.action.ActionListener; import org.opensearch.env.Environment; import org.opensearch.ingest.IngestDocument; @@ -35,6 +36,7 @@ public TextEmbeddingProcessor( ProcessorInputValidator processorInputValidator ) { super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment, processorInputValidator); + if (StringUtils.isBlank(modelId)) throw new IllegalArgumentException("model_id is null or empty, cannot process it"); } @Override From 4babd4dc24bd21747a921eaaa0b03f6c548ecbde Mon Sep 17 00:00:00 2001 From: xinyual Date: Fri, 1 Mar 2024 16:30:11 +0800 Subject: [PATCH 051/189] use inference processor Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../neuralsearch/plugin/NeuralSearch.java | 3 +- .../processor/DocumentChunkingProcessor.java | 165 ++++++------------ 2 files changed, 52 insertions(+), 116 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index de3839a68..96ce0b923 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -124,7 +124,8 @@ public Map getProcessors(Processor.Parameters paramet parameters.ingestService.getClusterService(), parameters.indicesService, parameters.analysisRegistry, - processorInputValidator + processorInputValidator, + parameters.env ) ); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index d1a1af536..3e9a60143 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -10,17 +10,18 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; +import java.util.function.BiConsumer; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; import org.opensearch.index.IndexService; import 
org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.indices.IndicesService; import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; -import org.opensearch.ingest.AbstractProcessor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.neuralsearch.processor.chunker.IFieldChunker; @@ -28,14 +29,16 @@ import static org.opensearch.ingest.ConfigurationUtils.readMap; -public final class DocumentChunkingProcessor extends AbstractProcessor { +public final class DocumentChunkingProcessor extends InferenceProcessor { public static final String TYPE = "chunking"; public static final String OUTPUT_FIELD = "output_field"; public static final String FIELD_MAP_FIELD = "field_map"; - private final Map fieldMap; + public static final String LIST_TYPE_NESTED_MAP_KEY = "chunking"; + + private final Map originalFieldMap; private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); @@ -54,11 +57,11 @@ public DocumentChunkingProcessor( Settings settings, ClusterService clusterService, IndicesService indicesService, - AnalysisRegistry analysisRegistry + AnalysisRegistry analysisRegistry, + Environment environment ) { - super(tag, description); - validateDocumentChunkingFieldMap(fieldMap); - this.fieldMap = fieldMap; + super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, "", tranferFieldMap(fieldMap), null, environment); + this.originalFieldMap = fieldMap; this.settings = settings; this.clusterService = clusterService; this.indicesService = indicesService; @@ -69,105 +72,6 @@ public String getType() { return TYPE; } - @SuppressWarnings("unchecked") - private void validateDocumentChunkingFieldMap(Map fieldMap) { - if (fieldMap == null || fieldMap.isEmpty()) { - throw new IllegalArgumentException("Unable to create the processor as field_map is null or empty"); - } - - for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { - String inputField = fieldMapEntry.getKey(); - Object parameters = fieldMapEntry.getValue(); - - if (parameters == null) { - throw new IllegalArgumentException("parameters for input field [" + inputField + "] is null, cannot process it."); - } - - if (!(parameters instanceof Map)) { - throw new IllegalArgumentException( - "parameters for input field [" + inputField + "] cannot be cast to [" + Map.class.getName() + "]" - ); - } - - Map parameterMap = (Map) parameters; - - // output field must be string - if (!(parameterMap.containsKey(OUTPUT_FIELD))) { - throw new IllegalArgumentException( - "parameters for input field [" + inputField + "] misses [" + OUTPUT_FIELD + "], cannot process it." 
- );
- }
-
- Object outputField = parameterMap.get(OUTPUT_FIELD);
-
- if (!(outputField instanceof String)) {
- throw new IllegalArgumentException(
- "parameters for output field [" + OUTPUT_FIELD + "] cannot be cast to [" + String.class.getName() + "]"
- );
- }
-
- // check non string parameters
- int chunkingAlgorithmCount = 0;
- Map<String, Object> chunkerParameters;
- for (Map.Entry<String, Object> parameterEntry : parameterMap.entrySet()) {
- if (!(parameterEntry.getKey() instanceof String)) {
- throw new IllegalArgumentException("found parameter entry with non-string key");
- }
- String parameterKey = (String) parameterEntry.getKey();
- if (supportedChunkers.contains(parameterKey)) {
- chunkingAlgorithmCount += 1;
- chunkerParameters = (Map<String, Object>) parameterEntry.getValue();
- IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry);
- chunker.validateParameters(chunkerParameters);
- }
- }
-
- // should only define one algorithm
- if (chunkingAlgorithmCount != 1) {
- throw new IllegalArgumentException("input field [" + inputField + "] should has and only has 1 chunking algorithm");
- }
- }
- }
-
- private void validateContent(Object content, String inputField) {
- // content can be a map, a list of strings or a list
- if (content instanceof Map) {
- @SuppressWarnings("unchecked")
- Map<String, Object> contentMap = (Map<String, Object>) content;
- for (Map.Entry<String, Object> contentEntry : contentMap.entrySet()) {
- String contentKey = contentEntry.getKey();
- Object contentValue = contentEntry.getValue();
- // the map value can also be a map, list or string
- validateContent(contentValue, inputField + "." + contentKey);
- }
- } else if (content instanceof List) {
- List contentList = (List) content;
- for (Object contentElement : contentList) {
- if (!(contentElement instanceof String)) {
- throw new IllegalArgumentException(
- "some element in input field list ["
- + inputField
- + "] of type ["
- + contentElement.getClass().getName()
- + "] cannot be cast to ["
- + String.class.getName()
- + "]"
- );
- }
- }
- } else if (!(content instanceof String)) {
- throw new IllegalArgumentException(
- "input field ["
- + inputField
- + "] of type ["
- + content.getClass().getName()
- + "] cannot be cast to ["
- + String.class.getName()
- + "]"
- );
- }
- }
-
 private Object chunk(IFieldChunker chunker, Object content, Map<String, Object> chunkerParameters) {
 // assume that content is either a map, list or string
 if (content instanceof Map) {
@@ -192,17 +96,32 @@ private Object chunk(IFieldChunker chunker, Object content, Map
 }
 }
+
+ private static Map<String, Object> tranferFieldMap(Map<String, Object> orignalMap) {
+ // The original map should map each input field to a parameter map that carries its output field
+ Map<String, Object> transferFieldMap = new HashMap<>();
+ Map<String, Object> tmpParameters = (Map<String, Object>) orignalMap.entrySet().iterator().next().getValue();
+ String inputField = orignalMap.entrySet().iterator().next().getKey();
+ Object outputField = tmpParameters.get(OUTPUT_FIELD);
+ transferFieldMap.put(inputField, outputField);
+ return transferFieldMap;
+ }
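+ // To make the transformation above concrete (an illustrative aside; the field
+ // names are invented, not taken from this patch): given
+ // fieldMap = { "body" : { "output_field" : "body_chunk", "fixed_token_length" : { ... } } }
+ // the method returns { "body" : "body_chunk" }, the flat input-to-output mapping
+ // that the InferenceProcessor superclass expects.

 @Override
- public IngestDocument execute(IngestDocument document) {
- for (Map.Entry<String, Object> fieldMapEntry : fieldMap.entrySet()) {
+ public void doExecute(
+ IngestDocument ingestDocument,
+ Map<String, Object> ProcessMap,
+ List<String> inferenceList,
+ BiConsumer<IngestDocument, Exception> handler
+ ) {
+ List<Object> results = new ArrayList<>();
+ for (Map.Entry<String, Object> fieldMapEntry : originalFieldMap.entrySet()) {
 String inputField = fieldMapEntry.getKey();
- Object content = document.getFieldValue(inputField, Object.class);
+ Object content = ingestDocument.getFieldValue(inputField, Object.class);
 if (content == null) {
 throw new IllegalArgumentException("input field in document [" + inputField + "] 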
is null, cannot process it."); } - validateContent(content, inputField); @SuppressWarnings("unchecked") Map parameters = (Map) fieldMapEntry.getValue(); @@ -217,7 +136,7 @@ public IngestDocument execute(IngestDocument document) { Map chunkerParameters = (Map) parameterEntry.getValue(); if (Objects.equals(parameterKey, ChunkerFactory.FIXED_LENGTH_ALGORITHM)) { // for fixed token length algorithm, add maxTokenCount to chunker parameters - Map sourceAndMetadataMap = document.getSourceAndMetadata(); + Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings); String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); @@ -229,11 +148,17 @@ public IngestDocument execute(IngestDocument document) { chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); } IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); - document.setFieldValue(outputField, chunk(chunker, content, chunkerParameters)); + results.add(chunk(chunker, content, chunkerParameters)); } } } - return document; + try { + setTargetFieldsToDocument(ingestDocument, ProcessMap, results); + handler.accept(ingestDocument, null); + } catch (Exception exception) { + handler.accept(null, exception); + } + } public static class Factory implements Processor.Factory { @@ -244,13 +169,22 @@ public static class Factory implements Processor.Factory { private final IndicesService indicesService; + private final Environment environment; + private final AnalysisRegistry analysisRegistry; - public Factory(Settings settings, ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry) { + public Factory( + Settings settings, + ClusterService clusterService, + IndicesService indicesService, + AnalysisRegistry analysisRegistry, + Environment environment + ) { this.settings = settings; this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; + this.environment = environment; } @Override @@ -268,7 +202,8 @@ public DocumentChunkingProcessor create( settings, clusterService, indicesService, - analysisRegistry + analysisRegistry, + environment ); } From 24f4980ba5d196f7f74391de5cabb15c4d9ad90a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 5 Mar 2024 10:35:38 +0800 Subject: [PATCH 052/189] draft code for extending inference processor with document chunking processor Signed-off-by: yuye-aws --- .../neuralsearch/plugin/NeuralSearch.java | 4 +- .../processor/DocumentChunkingProcessor.java | 296 ++++++++++++++---- .../processor/InferenceProcessor.java | 6 +- .../DocumentChunkingProcessorTests.java | 16 +- 4 files changed, 258 insertions(+), 64 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index 96ce0b923..b7d76f80b 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -124,8 +124,8 @@ public Map getProcessors(Processor.Parameters paramet parameters.ingestService.getClusterService(), parameters.indicesService, parameters.analysisRegistry, - processorInputValidator, - parameters.env + parameters.env, + processorInputValidator ) ); } diff --git 
a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 3e9a60143..a5312c04d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -9,23 +9,19 @@ import java.util.Set; import java.util.ArrayList; import java.util.List; -import java.util.Objects; +import java.util.LinkedHashMap; import java.util.function.BiConsumer; +import java.util.stream.Collectors; -import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; import org.opensearch.env.Environment; -import org.opensearch.index.IndexService; -import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.indices.IndicesService; import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; -import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.neuralsearch.processor.chunker.IFieldChunker; -import org.opensearch.index.mapper.IndexFieldMapper; import static org.opensearch.ingest.ConfigurationUtils.readMap; @@ -38,7 +34,7 @@ public final class DocumentChunkingProcessor extends InferenceProcessor { public static final String LIST_TYPE_NESTED_MAP_KEY = "chunking"; - private final Map originalFieldMap; + private final Map chunkingFieldMap; private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); @@ -58,11 +54,23 @@ public DocumentChunkingProcessor( ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry, - Environment environment + Environment environment, + ProcessorInputValidator processorInputValidator ) { - super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, "", tranferFieldMap(fieldMap), null, environment); - this.originalFieldMap = fieldMap; + super( + tag, + description, + TYPE, + LIST_TYPE_NESTED_MAP_KEY, + null, + transformFieldMap(fieldMap), + null, + environment, + processorInputValidator + ); + validateFieldMap(fieldMap, ""); this.settings = settings; + this.chunkingFieldMap = fieldMap; this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; @@ -96,14 +104,89 @@ private Object chunk(IFieldChunker chunker, Object content, Map } } - private static Map tranferFieldMap(Map orignalMap) { - // The original map should be - Map transferFieldMap = new HashMap<>(); - Map tmpParameters = (Map) orignalMap.entrySet().iterator().next().getValue(); - String inputField = orignalMap.entrySet().iterator().next().getKey(); - Object outputField = tmpParameters.get(OUTPUT_FIELD); - transferFieldMap.put(inputField, outputField); - return transferFieldMap; + @SuppressWarnings("unchecked") + private void validateFieldMap(Map fieldMap, String inputPrefix) { + for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { + String inputField = fieldMapEntry.getKey(); + if (fieldMapEntry.getValue() instanceof Map) { + Map insideFieldMap = (Map) fieldMapEntry.getValue(); + if (insideFieldMap.containsKey(OUTPUT_FIELD)) { + validateChunkingFieldMap(insideFieldMap, inputPrefix + "." + inputField); + } else { + validateFieldMap(insideFieldMap, inputPrefix + "." 
+ inputField); + } + } + } + } + + @SuppressWarnings("unchecked") + private void validateChunkingFieldMap(Map fieldMap, String inputField) { + // this function validates the parameters for chunking processors with: + // 1. the output field is a string + // 2. the chunker parameters must include and only include 1 type of chunker + // 3. the chunker parameters must be validated by each algorithm + Object outputField = fieldMap.get(OUTPUT_FIELD); + + if (!(outputField instanceof String)) { + throw new IllegalArgumentException( + "parameters for output field [" + OUTPUT_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + ); + } + + // check non string parameter key + // validate each algorithm + int chunkingAlgorithmCount = 0; + Map chunkerParameters; + for (Map.Entry parameterEntry : fieldMap.entrySet()) { + if (!(parameterEntry.getKey() instanceof String)) { + throw new IllegalArgumentException("found parameter entry with non-string key"); + } + String parameterKey = (String) parameterEntry.getKey(); + if (supportedChunkers.contains(parameterKey)) { + chunkingAlgorithmCount += 1; + chunkerParameters = (Map) parameterEntry.getValue(); + IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); + chunker.validateParameters(chunkerParameters); + } + } + + // should only define one algorithm + if (chunkingAlgorithmCount != 1) { + throw new IllegalArgumentException("input field [" + inputField + "] should has and only has 1 chunking algorithm"); + } + } + + private static Map transformFieldMap(Map fieldMap) { + // transform the into field map for inference processor + Map transformedFieldMap = new HashMap<>(); + for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { + String inputField = fieldMapEntry.getKey(); + if (fieldMapEntry.getValue() instanceof Map) { + Map insideFieldMap = (Map) fieldMapEntry.getValue(); + if (insideFieldMap.containsKey(OUTPUT_FIELD)) { + Object outputField = insideFieldMap.get(OUTPUT_FIELD); + transformedFieldMap.put(inputField, outputField); + } else { + transformedFieldMap.put(inputField, transformFieldMap(insideFieldMap)); + } + } + } + return transformedFieldMap; + } + + private List> chunk(List contents, Map> parameter) { + // parameter only contains 1 key defining chunker type + // its value should be chunking parameters + List> chunkedContents = new ArrayList<>(); + for (Map.Entry> parameterEntry : parameter.entrySet()) { + String type = parameterEntry.getKey(); + Map chunkerParameters = parameterEntry.getValue(); + IFieldChunker chunker = ChunkerFactory.create(type, analysisRegistry); + for (String content : contents) { + chunkedContents.add(chunker.chunk(content, chunkerParameters)); + } + } + return chunkedContents; } @Override @@ -113,52 +196,144 @@ public void doExecute( List inferenceList, BiConsumer handler ) { - List results = new ArrayList<>(); - for (Map.Entry fieldMapEntry : originalFieldMap.entrySet()) { - String inputField = fieldMapEntry.getKey(); - Object content = ingestDocument.getFieldValue(inputField, Object.class); + throw new RuntimeException("method doExecute() not implemented in document chunking processor"); + } - if (content == null) { - throw new IllegalArgumentException("input field in document [" + inputField + "] is null, cannot process it."); + @Override + public void execute(IngestDocument ingestDocument, BiConsumer handler) { + try { + processorInputValidator.validateFieldsValue(fieldMap, environment, ingestDocument, false); + Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); + Map 
processMap = buildMapWithProcessorKeyAndOriginalValue(sourceAndMetadataMap, chunkingFieldMap); + // List inferenceList = createInferenceList(processMap); + // List> results = chunk(processMap); + // setTargetFieldsToDocument(ingestDocument, processMap, results); + handler.accept(ingestDocument, null); + /* + if (inferenceList.isEmpty()) { + handler.accept(ingestDocument, null); + } else { + // perform chunking + List> results = chunk(inferenceList, processMap); + setTargetFieldsToDocument(ingestDocument, processMap, results); + doExecute(ingestDocument, processMap, inferenceList, handler); + handler.accept(ingestDocument, null); } + */ + } catch (Exception e) { + handler.accept(null, e); + } + } - - @SuppressWarnings("unchecked") - Map parameters = (Map) fieldMapEntry.getValue(); - String outputField = (String) parameters.get(OUTPUT_FIELD); - - // we have validated that there is one chunking algorithm - // and that chunkerParameters is of type Map - for (Map.Entry parameterEntry : parameters.entrySet()) { - String parameterKey = parameterEntry.getKey(); - if (supportedChunkers.contains(parameterKey)) { - @SuppressWarnings("unchecked") - Map chunkerParameters = (Map) parameterEntry.getValue(); - if (Objects.equals(parameterKey, ChunkerFactory.FIXED_LENGTH_ALGORITHM)) { - // for fixed token length algorithm, add maxTokenCount to chunker parameters - Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); - int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings); - String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); - IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); - if (indexMetadata != null) { - // if the index exists, read maxTokenCount from the index setting - IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); - maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); - } - chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); - } - IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); - results.add(chunk(chunker, content, chunkerParameters)); + Map buildMapWithProcessorKeyAndOriginalValue( + Map sourceAndMetadataMap, + Map chunkingFieldMap + ) { + // the leaf map for processMap contains two key value pairs + // parameters: the chunker parameters, Map + // inferenceList: a list of strings to be chunked, List + Map mapWithProcessorKeys = new LinkedHashMap<>(); + for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { + String originalKey = fieldMapEntry.getKey(); + Object targetKey = fieldMapEntry.getValue(); + if (targetKey instanceof Map) { + Map treeRes = new LinkedHashMap<>(); + buildMapWithProcessorKeyAndOriginalValueForMapType(originalKey, targetKey, sourceAndMetadataMap, chunkingFieldMap, treeRes); + mapWithProcessorKeys.put(originalKey, treeRes.get(originalKey)); + } else { + Map leafMap = new HashMap<>(); + leafMap.put("parameters", chunkingFieldMap.get(originalKey)); + Object inferenceObject = sourceAndMetadataMap.get(originalKey); + // inferenceObject is either a string or a list of strings + if (inferenceObject instanceof List) { + leafMap.put("inferenceList", inferenceObject); + } else { + leafMap.put("inferenceList", stringToList((String) inferenceObject)); } + mapWithProcessorKeys.put(String.valueOf(targetKey), leafMap); } } - try { - setTargetFieldsToDocument(ingestDocument, ProcessMap, results); - handler.accept(ingestDocument, null); - } catch (Exception exception) { - 
handler.accept(null, exception); + return mapWithProcessorKeys; + } + + @SuppressWarnings("unchecked") + private void buildMapWithProcessorKeyAndOriginalValueForMapType( + String parentKey, + Object processorKey, + Map sourceAndMetadataMap, + Map chunkingFieldMap, + Map treeRes + ) { + if (processorKey == null || sourceAndMetadataMap == null || chunkingFieldMap == null) return; + if (processorKey instanceof Map) { + Map next = new LinkedHashMap<>(); + if (sourceAndMetadataMap.get(parentKey) instanceof Map) { + for (Map.Entry nestedFieldMapEntry : ((Map) processorKey).entrySet()) { + buildMapWithProcessorKeyAndOriginalValueForMapType( + nestedFieldMapEntry.getKey(), + nestedFieldMapEntry.getValue(), + (Map) sourceAndMetadataMap.get(parentKey), + (Map) chunkingFieldMap.get(parentKey), + next + ); + } + } else if (sourceAndMetadataMap.get(parentKey) instanceof List) { + for (Map.Entry nestedFieldMapEntry : ((Map) processorKey).entrySet()) { + List> list = (List>) sourceAndMetadataMap.get(parentKey); + List listOfStrings = list.stream().map(x -> x.get(nestedFieldMapEntry.getKey())).collect(Collectors.toList()); + Map map = new LinkedHashMap<>(); + map.put(nestedFieldMapEntry.getKey(), listOfStrings); + buildMapWithProcessorKeyAndOriginalValueForMapType( + nestedFieldMapEntry.getKey(), + nestedFieldMapEntry.getValue(), + map, + (Map) chunkingFieldMap.get(nestedFieldMapEntry.getKey()), + next + ); + } + } + treeRes.put(parentKey, next); + } else { + Map leafMap = new HashMap<>(); + leafMap.put("parameters", chunkingFieldMap.get(parentKey)); + Object inferenceObject = sourceAndMetadataMap.get(parentKey); + // inferenceObject is either a string or a list of strings + if (inferenceObject instanceof List) { + leafMap.put("inferenceList", inferenceObject); + } else { + leafMap.put("inferenceList", stringToList((String) inferenceObject)); + } + treeRes.put(parentKey, leafMap); } + } + + private static List stringToList(String string) { + List list = new ArrayList<>(); + list.add(string); + return list; + } + + @SuppressWarnings("unchecked") + private List> createInferenceList(Map processMap) { + List> texts = new ArrayList<>(); + processMap.entrySet().stream().filter(processMapEntry -> processMapEntry.getValue() != null).forEach(processMapEntry -> { + Map sourceValue = (Map) processMapEntry.getValue(); + // get "inferenceList" key + createInferenceListForMapTypeInput(sourceValue, texts); + }); + return texts; + } + @SuppressWarnings("unchecked") + private void createInferenceListForMapTypeInput(Map mapInput, List> texts) { + if (mapInput.containsKey("inferenceList")) { + texts.add((List) mapInput.get("inferenceList")); + return; + } + for (Map.Entry nestedFieldMapEntry : mapInput.entrySet()) { + Map nestedMapInput = (Map) nestedFieldMapEntry.getValue(); + createInferenceListForMapTypeInput(nestedMapInput, texts); + } } public static class Factory implements Processor.Factory { @@ -173,18 +348,22 @@ public static class Factory implements Processor.Factory { private final AnalysisRegistry analysisRegistry; + private final ProcessorInputValidator processorInputValidator; + public Factory( Settings settings, ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry, - Environment environment + Environment environment, + ProcessorInputValidator processorInputValidator ) { this.settings = settings; this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; this.environment = environment; + 
 this.processorInputValidator = processorInputValidator;
 }
 @Override
@@ -203,7 +382,8 @@ public DocumentChunkingProcessor create(
 clusterService,
 indicesService,
 analysisRegistry,
- environment
+ environment,
+ processorInputValidator
 );
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
index e072e5d7d..bf73be795 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
@@ -43,13 +43,13 @@ public abstract class InferenceProcessor extends AbstractProcessor {
 protected final String modelId;
- private final Map<String, Object> fieldMap;
+ protected final Map<String, Object> fieldMap;
 protected final MLCommonsClientAccessor mlCommonsClientAccessor;
- private final Environment environment;
+ protected final Environment environment;
- private final ProcessorInputValidator processorInputValidator;
+ protected final ProcessorInputValidator processorInputValidator;
 public InferenceProcessor(
 String tag,
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
index d02c75350..61590c5ed 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
@@ -8,6 +8,7 @@
 import lombok.SneakyThrows;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.junit.Before;
+import org.mockito.Mock;
 import org.opensearch.cluster.ClusterState;
 import org.opensearch.cluster.metadata.Metadata;
 import org.opensearch.cluster.service.ClusterService;
@@ -48,6 +49,12 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase {
 private static final String OUTPUT_FIELD = "body_chunk";
 private static final String INDEX_NAME = "_index";
+ @Mock
+ private ProcessorInputValidator processorInputValidator;
+
+ @Mock
+ private Environment environment;
+
 @SneakyThrows
 private AnalysisRegistry getAnalysisRegistry() {
 Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
@@ -78,7 +85,14 @@ public void setup() {
 when(metadata.index(anyString())).thenReturn(null);
 when(clusterState.metadata()).thenReturn(metadata);
 when(clusterService.state()).thenReturn(clusterState);
- factory = new DocumentChunkingProcessor.Factory(settings, clusterService, indicesService, getAnalysisRegistry());
+ factory = new DocumentChunkingProcessor.Factory(
+ settings,
+ clusterService,
+ indicesService,
+ getAnalysisRegistry(),
+ environment,
+ processorInputValidator
+ );
 }
 private Map<String, Object> createFixedTokenLengthParameters() {

From 0b4036af1f2c582cbc2ddd0ea2e68d4dff967898 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Tue, 5 Mar 2024 16:13:19 +0800
Subject: [PATCH 053/189] api refactor for document chunking processor

Signed-off-by: yuye-aws
---
 .../processor/DocumentChunkingProcessor.java | 302 ++++--------
 .../processor/InferenceProcessor.java | 2 +-
 .../DocumentChunkingProcessorTests.java | 56 +---
 3 files changed, 75 insertions(+), 285 deletions(-)
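Before the diff, a sketch of the configuration shape this refactor introduces, mirrored from the test changes further down (the field names "body" and "body_chunk" are illustrative, not from the patch):

Map<String, Object> config = new HashMap<>();
Map<String, Object> fieldMap = new HashMap<>();
fieldMap.put("body", "body_chunk"); // input field -> output field
Map<String, Object> algorithmMap = new HashMap<>();
algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, Map.of("token_limit", 10));
config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap);
config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap);
// factory.create(registry, tag, description, config) should then yield a processor
// that chunks "body" into "body_chunk" with the fixed token length algorithm.

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
index a5312c04d..34a02eb35 100644
--- 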
a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -4,42 +4,50 @@ */ package org.opensearch.neuralsearch.processor; -import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.ArrayList; import java.util.List; -import java.util.LinkedHashMap; +import java.util.Objects; import java.util.function.BiConsumer; -import java.util.stream.Collectors; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.index.IndexService; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; import org.opensearch.env.Environment; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.indices.IndicesService; +import org.opensearch.index.IndexSettings; import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.IFieldChunker; +import org.opensearch.index.mapper.IndexFieldMapper; +import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import static org.opensearch.ingest.ConfigurationUtils.readMap; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.DELIMITER_ALGORITHM; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; public final class DocumentChunkingProcessor extends InferenceProcessor { public static final String TYPE = "chunking"; - public static final String OUTPUT_FIELD = "output_field"; public static final String FIELD_MAP_FIELD = "field_map"; - public static final String LIST_TYPE_NESTED_MAP_KEY = "chunking"; + public static final String ALGORITHM_FIELD = "algorithm"; - private final Map chunkingFieldMap; + public static final String LIST_TYPE_NESTED_MAP_KEY = "chunking"; private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); private final Settings settings; + private String chunkerType; + + private Map chunkerParameters; + private final ClusterService clusterService; private final IndicesService indicesService; @@ -50,6 +58,7 @@ public DocumentChunkingProcessor( String tag, String description, Map fieldMap, + Map algorithmMap, Settings settings, ClusterService clusterService, IndicesService indicesService, @@ -57,20 +66,9 @@ public DocumentChunkingProcessor( Environment environment, ProcessorInputValidator processorInputValidator ) { - super( - tag, - description, - TYPE, - LIST_TYPE_NESTED_MAP_KEY, - null, - transformFieldMap(fieldMap), - null, - environment, - processorInputValidator - ); - validateFieldMap(fieldMap, ""); + super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, null, fieldMap, null, environment, processorInputValidator); + validateAndParseAlgorithmMap(algorithmMap); this.settings = settings; - this.chunkingFieldMap = fieldMap; this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; @@ -80,115 +78,44 @@ public String getType() { return TYPE; } - private Object chunk(IFieldChunker chunker, Object content, Map chunkerParameters) { + private List chunk(String content) { // assume that content is either a map, list or string - if (content instanceof Map) { - Map chunkedPassageMap = new HashMap<>(); - Map contentMap = (Map) content; - for (Map.Entry contentEntry : contentMap.entrySet()) { - String contentKey = contentEntry.getKey(); 
- Object contentValue = contentEntry.getValue(); - // contentValue can also be a map, list or string - chunkedPassageMap.put(contentKey, chunk(chunker, contentValue, chunkerParameters)); - } - return chunkedPassageMap; - } else if (content instanceof List) { - List chunkedPassageList = new ArrayList<>(); - List contentList = (List) content; - for (Object contentElement : contentList) { - chunkedPassageList.addAll(chunker.chunk((String) contentElement, chunkerParameters)); - } - return chunkedPassageList; - } else { - return chunker.chunk((String) content, chunkerParameters); - } + IFieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); + return chunker.chunk(content, chunkerParameters); } @SuppressWarnings("unchecked") - private void validateFieldMap(Map fieldMap, String inputPrefix) { - for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { - String inputField = fieldMapEntry.getKey(); - if (fieldMapEntry.getValue() instanceof Map) { - Map insideFieldMap = (Map) fieldMapEntry.getValue(); - if (insideFieldMap.containsKey(OUTPUT_FIELD)) { - validateChunkingFieldMap(insideFieldMap, inputPrefix + "." + inputField); - } else { - validateFieldMap(insideFieldMap, inputPrefix + "." + inputField); - } - } - } - } - - @SuppressWarnings("unchecked") - private void validateChunkingFieldMap(Map fieldMap, String inputField) { - // this function validates the parameters for chunking processors with: - // 1. the output field is a string - // 2. the chunker parameters must include and only include 1 type of chunker - // 3. the chunker parameters must be validated by each algorithm - Object outputField = fieldMap.get(OUTPUT_FIELD); - - if (!(outputField instanceof String)) { + private void validateAndParseAlgorithmMap(Map algorithmMap) { + if (algorithmMap.size() != 1) { throw new IllegalArgumentException( - "parameters for output field [" + OUTPUT_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm" ); } - // check non string parameter key - // validate each algorithm - int chunkingAlgorithmCount = 0; - Map chunkerParameters; - for (Map.Entry parameterEntry : fieldMap.entrySet()) { - if (!(parameterEntry.getKey() instanceof String)) { - throw new IllegalArgumentException("found parameter entry with non-string key"); + for (Map.Entry algorithmEntry : algorithmMap.entrySet()) { + String algorithmKey = algorithmEntry.getKey(); + Object algorithmValue = algorithmEntry.getValue(); + if (!supportedChunkers.contains(algorithmKey)) { + throw new IllegalArgumentException( + "Unable to create the processor as chunker algorithm [" + + algorithmKey + + "] is not supported. 
Supported chunkers types are [" + + FIXED_LENGTH_ALGORITHM + + ", " + + DELIMITER_ALGORITHM + + "]" + ); } - String parameterKey = (String) parameterEntry.getKey(); - if (supportedChunkers.contains(parameterKey)) { - chunkingAlgorithmCount += 1; - chunkerParameters = (Map) parameterEntry.getValue(); - IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); - chunker.validateParameters(chunkerParameters); + if (!(algorithmValue instanceof Map)) { + throw new IllegalArgumentException( + "Unable to create the processor as [" + ALGORITHM_FIELD + "] cannot be cast to [" + Map.class.getName() + "]" + ); } - } - - // should only define one algorithm - if (chunkingAlgorithmCount != 1) { - throw new IllegalArgumentException("input field [" + inputField + "] should has and only has 1 chunking algorithm"); + this.chunkerType = algorithmKey; + this.chunkerParameters = (Map) algorithmValue; } } - private static Map transformFieldMap(Map fieldMap) { - // transform the into field map for inference processor - Map transformedFieldMap = new HashMap<>(); - for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { - String inputField = fieldMapEntry.getKey(); - if (fieldMapEntry.getValue() instanceof Map) { - Map insideFieldMap = (Map) fieldMapEntry.getValue(); - if (insideFieldMap.containsKey(OUTPUT_FIELD)) { - Object outputField = insideFieldMap.get(OUTPUT_FIELD); - transformedFieldMap.put(inputField, outputField); - } else { - transformedFieldMap.put(inputField, transformFieldMap(insideFieldMap)); - } - } - } - return transformedFieldMap; - } - - private List> chunk(List contents, Map> parameter) { - // parameter only contains 1 key defining chunker type - // its value should be chunking parameters - List> chunkedContents = new ArrayList<>(); - for (Map.Entry> parameterEntry : parameter.entrySet()) { - String type = parameterEntry.getKey(); - Map chunkerParameters = parameterEntry.getValue(); - IFieldChunker chunker = ChunkerFactory.create(type, analysisRegistry); - for (String content : contents) { - chunkedContents.add(chunker.chunk(content, chunkerParameters)); - } - } - return chunkedContents; - } - @Override public void doExecute( IngestDocument ingestDocument, @@ -196,114 +123,30 @@ public void doExecute( List inferenceList, BiConsumer handler ) { - throw new RuntimeException("method doExecute() not implemented in document chunking processor"); - } - - @Override - public void execute(IngestDocument ingestDocument, BiConsumer handler) { try { processorInputValidator.validateFieldsValue(fieldMap, environment, ingestDocument, false); - Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); - Map processMap = buildMapWithProcessorKeyAndOriginalValue(sourceAndMetadataMap, chunkingFieldMap); - // List inferenceList = createInferenceList(processMap); - // List> results = chunk(processMap); - // setTargetFieldsToDocument(ingestDocument, processMap, results); - handler.accept(ingestDocument, null); - /* - if (inferenceList.isEmpty()) { - handler.accept(ingestDocument, null); - } else { - // perform chunking - List> results = chunk(inferenceList, processMap); - setTargetFieldsToDocument(ingestDocument, processMap, results); - doExecute(ingestDocument, processMap, inferenceList, handler); - handler.accept(ingestDocument, null); - } - */ - } catch (Exception e) { - handler.accept(null, e); - } - } - - Map buildMapWithProcessorKeyAndOriginalValue( - Map sourceAndMetadataMap, - Map chunkingFieldMap - ) { - // the leaf map for processMap contains two key value pairs - // parameters: 
the chunker parameters, Map - // inferenceList: a list of strings to be chunked, List - Map mapWithProcessorKeys = new LinkedHashMap<>(); - for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { - String originalKey = fieldMapEntry.getKey(); - Object targetKey = fieldMapEntry.getValue(); - if (targetKey instanceof Map) { - Map treeRes = new LinkedHashMap<>(); - buildMapWithProcessorKeyAndOriginalValueForMapType(originalKey, targetKey, sourceAndMetadataMap, chunkingFieldMap, treeRes); - mapWithProcessorKeys.put(originalKey, treeRes.get(originalKey)); - } else { - Map leafMap = new HashMap<>(); - leafMap.put("parameters", chunkingFieldMap.get(originalKey)); - Object inferenceObject = sourceAndMetadataMap.get(originalKey); - // inferenceObject is either a string or a list of strings - if (inferenceObject instanceof List) { - leafMap.put("inferenceList", inferenceObject); - } else { - leafMap.put("inferenceList", stringToList((String) inferenceObject)); + if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { + // add maxTokenCount setting from index metadata to chunker parameters + Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); + String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); + int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings); + IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); + if (indexMetadata != null) { + // if the index exists, read maxTokenCount from the index setting + IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); + maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); } - mapWithProcessorKeys.put(String.valueOf(targetKey), leafMap); + chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); } - } - return mapWithProcessorKeys; - } - @SuppressWarnings("unchecked") - private void buildMapWithProcessorKeyAndOriginalValueForMapType( - String parentKey, - Object processorKey, - Map sourceAndMetadataMap, - Map chunkingFieldMap, - Map treeRes - ) { - if (processorKey == null || sourceAndMetadataMap == null || chunkingFieldMap == null) return; - if (processorKey instanceof Map) { - Map next = new LinkedHashMap<>(); - if (sourceAndMetadataMap.get(parentKey) instanceof Map) { - for (Map.Entry nestedFieldMapEntry : ((Map) processorKey).entrySet()) { - buildMapWithProcessorKeyAndOriginalValueForMapType( - nestedFieldMapEntry.getKey(), - nestedFieldMapEntry.getValue(), - (Map) sourceAndMetadataMap.get(parentKey), - (Map) chunkingFieldMap.get(parentKey), - next - ); - } - } else if (sourceAndMetadataMap.get(parentKey) instanceof List) { - for (Map.Entry nestedFieldMapEntry : ((Map) processorKey).entrySet()) { - List> list = (List>) sourceAndMetadataMap.get(parentKey); - List listOfStrings = list.stream().map(x -> x.get(nestedFieldMapEntry.getKey())).collect(Collectors.toList()); - Map map = new LinkedHashMap<>(); - map.put(nestedFieldMapEntry.getKey(), listOfStrings); - buildMapWithProcessorKeyAndOriginalValueForMapType( - nestedFieldMapEntry.getKey(), - nestedFieldMapEntry.getValue(), - map, - (Map) chunkingFieldMap.get(nestedFieldMapEntry.getKey()), - next - ); - } - } - treeRes.put(parentKey, next); - } else { - Map leafMap = new HashMap<>(); - leafMap.put("parameters", chunkingFieldMap.get(parentKey)); - Object inferenceObject = sourceAndMetadataMap.get(parentKey); - // inferenceObject is either a string or a list of strings - if (inferenceObject instanceof List) { - leafMap.put("inferenceList", 
inferenceObject); - } else { - leafMap.put("inferenceList", stringToList((String) inferenceObject)); + List> chunkedResults = new ArrayList<>(); + for (String inferenceString : inferenceList) { + chunkedResults.add(chunk(inferenceString)); } - treeRes.put(parentKey, leafMap); + setTargetFieldsToDocument(ingestDocument, ProcessMap, chunkedResults); + handler.accept(ingestDocument, null); + } catch (Exception e) { + handler.accept(null, e); } } @@ -313,29 +156,6 @@ private static List stringToList(String string) { return list; } - @SuppressWarnings("unchecked") - private List> createInferenceList(Map processMap) { - List> texts = new ArrayList<>(); - processMap.entrySet().stream().filter(processMapEntry -> processMapEntry.getValue() != null).forEach(processMapEntry -> { - Map sourceValue = (Map) processMapEntry.getValue(); - // get "inferenceList" key - createInferenceListForMapTypeInput(sourceValue, texts); - }); - return texts; - } - - @SuppressWarnings("unchecked") - private void createInferenceListForMapTypeInput(Map mapInput, List> texts) { - if (mapInput.containsKey("inferenceList")) { - texts.add((List) mapInput.get("inferenceList")); - return; - } - for (Map.Entry nestedFieldMapEntry : mapInput.entrySet()) { - Map nestedMapInput = (Map) nestedFieldMapEntry.getValue(); - createInferenceListForMapTypeInput(nestedMapInput, texts); - } - } - public static class Factory implements Processor.Factory { private final Settings settings; @@ -374,10 +194,12 @@ public DocumentChunkingProcessor create( Map config ) throws Exception { Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); + Map algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD); return new DocumentChunkingProcessor( processorTag, description, fieldMap, + algorithmMap, settings, clusterService, indicesService, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java index bf73be795..b88bee0e1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java @@ -121,7 +121,7 @@ public void execute(IngestDocument ingestDocument, BiConsumer createInferenceList(Map knnKeyMap) { + protected List createInferenceList(Map knnKeyMap) { List texts = new ArrayList<>(); knnKeyMap.entrySet().stream().filter(knnMapEntry -> knnMapEntry.getValue() != null).forEach(knnMapEntry -> { Object sourceValue = knnMapEntry.getValue(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 61590c5ed..c70d64d32 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -110,12 +110,12 @@ private Map createDelimiterParameters() { @SneakyThrows private DocumentChunkingProcessor createFixedTokenLengthInstance() { Map config = new HashMap<>(); - Map fieldParameters = new HashMap<>(); - Map chunkerParameters = new HashMap<>(); - chunkerParameters.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD); - fieldParameters.put(INPUT_FIELD, chunkerParameters); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters); + Map fieldMap = 
new HashMap<>(); + Map algorithmMap = new HashMap<>(); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @@ -123,12 +123,12 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance() { @SneakyThrows private DocumentChunkingProcessor createDelimiterInstance() { Map config = new HashMap<>(); - Map fieldParameters = new HashMap<>(); - Map chunkerParameters = new HashMap<>(); - chunkerParameters.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); - chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD); - fieldParameters.put(INPUT_FIELD, chunkerParameters); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters); + Map fieldMap = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @@ -171,38 +171,6 @@ public void testCreate_whenFieldMapWithIllegalParameterType_failure() { assertEquals("parameters for input field [key] cannot be cast to [java.util.Map]", illegalArgumentException.getMessage()); } - public void testCreate_whenFieldMapWithEmptyOutputField_failure() { - Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); - fieldMap.put(INPUT_FIELD, ImmutableMap.of()); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) - ); - assertEquals( - "parameters for input field [" + INPUT_FIELD + "] misses [" + DocumentChunkingProcessor.OUTPUT_FIELD + "], cannot process it.", - illegalArgumentException.getMessage() - ); - } - - public void testCreate_whenFieldMapWithIllegalOutputField_failure() { - Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); - fieldMap.put(INPUT_FIELD, ImmutableMap.of(DocumentChunkingProcessor.OUTPUT_FIELD, 1)); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) - ); - assertEquals( - "parameters for output field [output_field] cannot be cast to [java.lang.String]", - illegalArgumentException.getMessage() - ); - } - public void testCreate_whenFieldMapWithIllegalKey_failure() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); From 9ff6645df06008f19953ef86465fd47457cd300d Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 5 Mar 2024 17:00:23 +0800 Subject: [PATCH 054/189] remove nested list key for chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 25 ++++++++++++++++--- .../processor/InferenceProcessor.java | 2 +- .../DocumentChunkingProcessorTests.java | 12 ++++++--- .../chunker/DelimiterChunkerTests.java | 2 +- 4 
files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 34a02eb35..8f151c20a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -9,6 +9,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; +import java.util.stream.IntStream; import java.util.function.BiConsumer; import org.opensearch.cluster.metadata.IndexMetadata; @@ -38,8 +39,6 @@ public final class DocumentChunkingProcessor extends InferenceProcessor { public static final String ALGORITHM_FIELD = "algorithm"; - public static final String LIST_TYPE_NESTED_MAP_KEY = "chunking"; - private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); private final Settings settings; @@ -66,7 +65,7 @@ public DocumentChunkingProcessor( Environment environment, ProcessorInputValidator processorInputValidator ) { - super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, null, fieldMap, null, environment, processorInputValidator); + super(tag, description, TYPE, null, null, fieldMap, null, environment, processorInputValidator); validateAndParseAlgorithmMap(algorithmMap); this.settings = settings; this.clusterService = clusterService; @@ -116,6 +115,24 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { } } + @Override + protected List buildResultForListType(List sourceValue, List results, IndexWrapper indexWrapper) { + Object peek = sourceValue.get(0); + if (peek instanceof String) { + List keyToResult = new ArrayList<>(); + IntStream.range(0, sourceValue.size()).forEachOrdered(x -> keyToResult.add(results.get(indexWrapper.index++))); + return keyToResult; + } else { + List> keyToResult = new ArrayList<>(); + for (Object nestedList : sourceValue) { + List nestedResult = new ArrayList<>(); + IntStream.range(0, ((List) nestedList).size()).forEachOrdered(x -> nestedResult.add(results.get(indexWrapper.index++))); + keyToResult.add(nestedResult); + } + return keyToResult; + } + } + @Override public void doExecute( IngestDocument ingestDocument, @@ -124,7 +141,7 @@ public void doExecute( BiConsumer handler ) { try { - processorInputValidator.validateFieldsValue(fieldMap, environment, ingestDocument, false); + processorInputValidator.validateFieldsValue(fieldMap, environment, ingestDocument, true); if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java index b88bee0e1..e54d8b1d8 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java @@ -303,7 +303,7 @@ public String getType() { * index: the index pointer of the text embedding result. 
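 * For example (an illustrative note): if a document's source fields produce three
 * chunked strings in total, the buildResultForListType override above advances this
 * shared pointer from 0 to 3 via results.get(indexWrapper.index++), distributing the
 * flat result list back into the nested list structure of the source value.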
*/ static class IndexWrapper { - private int index; + protected int index; protected IndexWrapper(int index) { this.index = index; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index c70d64d32..b25ab3269 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -173,9 +173,13 @@ public void testCreate_whenFieldMapWithIllegalParameterType_failure() { public void testCreate_whenFieldMapWithIllegalKey_failure() { Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); - fieldMap.put(INPUT_FIELD, ImmutableMap.of(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD, 1, 1)); + Map fieldMap = new HashMap<>(); + fieldMap.put(null, 1); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + Map algorithmMap = new HashMap<>(); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -187,8 +191,10 @@ public void testCreate_whenFieldMapWithIllegalKey_failure() { public void testCreate_whenFieldMapWithNoAlgorithm_failure() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); - fieldMap.put(INPUT_FIELD, ImmutableMap.of(DocumentChunkingProcessor.OUTPUT_FIELD, INPUT_FIELD)); + Map algorithmMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index eb9808982..61fea30c2 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -69,7 +69,7 @@ public void testChunkerWithLimitNumber() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; Map inputParameters = Map.of(DELIMITER_FIELD, "\n", MAX_CHUNK_LIMIT_FIELD, 1); - Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.chunk(content, inputParameters)); + IllegalStateException exception = assertThrows(IllegalStateException.class, () -> chunker.chunk(content, inputParameters)); Assert.assertEquals("Exceed max chunk number: 1", exception.getMessage()); } From 0e464fed3492328d3618a845945716d75762927c Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 5 Mar 2024 17:08:24 +0800 Subject: [PATCH 055/189] remove unused function Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 8f151c20a..37d853018 100644 --- 
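The buildResultForListType override in the patch above drops the per-element nested-map wrapper that the embedding processors use and instead regroups the flat results list back onto the source structure. The regrouping relies on a single mutable cursor (the IndexWrapper whose index field the patch widens to protected) that advances through the results in document order, so consecutive fields consume consecutive slices. A self-contained sketch of that cursor idea, using simplified stand-ins rather than the plugin classes themselves:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.stream.IntStream;

    public class IndexWrapperSketch {
        // stand-in for InferenceProcessor.IndexWrapper: a shared, advancing cursor
        static class IndexWrapper {
            int index;
            IndexWrapper(int index) { this.index = index; }
        }

        // simplified string branch: one result is consumed per source element
        static List<Object> regroup(List<Object> sourceValue, List<?> results, IndexWrapper cursor) {
            List<Object> grouped = new ArrayList<>();
            IntStream.range(0, sourceValue.size()).forEachOrdered(i -> grouped.add(results.get(cursor.index++)));
            return grouped;
        }

        public static void main(String[] args) {
            IndexWrapper cursor = new IndexWrapper(0);
            List<Object> field1 = List.of("a", "b");
            List<Object> field2 = List.of("c");
            List<String> results = List.of("r0", "r1", "r2"); // one result per input, in order
            System.out.println(regroup(field1, results, cursor)); // [r0, r1]
            System.out.println(regroup(field2, results, cursor)); // [r2]
        }
    }

Because the cursor is shared across calls, the order in which fields are visited must match the order in which their inputs were submitted; that invariant is what lets a flat result list be folded back into nested lists without any bookkeeping per field.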
From 0e464fed3492328d3618a845945716d75762927c Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Tue, 5 Mar 2024 17:08:24 +0800
Subject: [PATCH 055/189] remove unused function

Signed-off-by: yuye-aws
---
 .../neuralsearch/processor/DocumentChunkingProcessor.java | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
index 8f151c20a..37d853018 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
@@ -167,12 +167,6 @@ public void doExecute(
         }
     }
 
-    private static List<String> stringToList(String string) {
-        List<String> list = new ArrayList<>();
-        list.add(string);
-        return list;
-    }
-
     public static class Factory implements Processor.Factory {
 
         private final Settings settings;

From d6b68ed44c6b7a72824afabdc8116dda6cd48f5e Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Wed, 6 Mar 2024 12:02:54 +0800
Subject: [PATCH 056/189] remove processor validator

Signed-off-by: yuye-aws
---
 .../neuralsearch/plugin/NeuralSearch.java     |  10 +-
 .../processor/DocumentChunkingProcessor.java  | 239 ++++++++++++++----
 .../processor/InferenceProcessor.java         |   8 +-
 .../processor/SparseEncodingProcessor.java    |   7 +-
 .../processor/TextEmbeddingProcessor.java     |   7 +-
 .../SparseEncodingProcessorFactory.java       |  19 +-
 .../TextEmbeddingProcessorFactory.java        |  20 +-
 .../DocumentChunkingProcessorTests.java       |  12 +-
 .../SparseEncodingProcessorTests.java         |   9 +-
 .../TextEmbeddingProcessorTests.java          |  15 +-
 10 files changed, 205 insertions(+), 141 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
index b7d76f80b..dc5b6e8f2 100644
--- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
+++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -29,7 +29,6 @@
 import org.opensearch.neuralsearch.processor.NeuralQueryEnricherProcessor;
 import org.opensearch.neuralsearch.processor.NormalizationProcessor;
 import org.opensearch.neuralsearch.processor.NormalizationProcessorWorkflow;
-import org.opensearch.neuralsearch.processor.ProcessorInputValidator;
 import org.opensearch.neuralsearch.processor.SparseEncodingProcessor;
 import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor;
 import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
@@ -110,12 +109,11 @@ public List<QuerySpec<?>> getQueries() {
     @Override
     public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
         clientAccessor = new MLCommonsClientAccessor(new MachineLearningNodeClient(parameters.client));
-        ProcessorInputValidator processorInputValidator = new ProcessorInputValidator();
         return Map.of(
             TextEmbeddingProcessor.TYPE,
-            new TextEmbeddingProcessorFactory(clientAccessor, parameters.env, processorInputValidator),
+            new TextEmbeddingProcessorFactory(clientAccessor, parameters.env),
             SparseEncodingProcessor.TYPE,
-            new SparseEncodingProcessorFactory(clientAccessor, parameters.env, processorInputValidator),
+            new SparseEncodingProcessorFactory(clientAccessor, parameters.env),
             TextImageEmbeddingProcessor.TYPE,
             new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
             DocumentChunkingProcessor.TYPE,
@@ -123,9 +121,7 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
                 parameters.env.settings(),
                 parameters.ingestService.getClusterService(),
                 parameters.indicesService,
-                parameters.analysisRegistry,
-                parameters.env,
-                processorInputValidator
+                parameters.analysisRegistry
             )
         );
     }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
index 37d853018..290756c12 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
@@ -9,17 +9,20 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import java.util.LinkedHashMap;
+import java.util.stream.Collectors;
 import java.util.stream.IntStream;
-import java.util.function.BiConsumer;
 
+import com.google.common.annotations.VisibleForTesting;
+import lombok.extern.log4j.Log4j2;
 import org.opensearch.cluster.metadata.IndexMetadata;
 import org.opensearch.index.IndexService;
 import org.opensearch.cluster.service.ClusterService;
 import org.opensearch.common.settings.Settings;
-import org.opensearch.env.Environment;
 import org.opensearch.index.analysis.AnalysisRegistry;
 import org.opensearch.indices.IndicesService;
 import org.opensearch.index.IndexSettings;
+import org.opensearch.ingest.AbstractProcessor;
 import org.opensearch.ingest.IngestDocument;
 import org.opensearch.ingest.Processor;
 import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
@@ -31,7 +34,8 @@
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.DELIMITER_ALGORITHM;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM;
 
-public final class DocumentChunkingProcessor extends InferenceProcessor {
+@Log4j2
+public final class DocumentChunkingProcessor extends AbstractProcessor {
 
     public static final String TYPE = "chunking";
 
@@ -47,6 +51,8 @@ public final class DocumentChunkingProcessor extends AbstractProcessor {
 
     private Map<String, Object> chunkerParameters;
 
+    private final Map<String, Object> fieldMap;
+
     private final ClusterService clusterService;
 
     private final IndicesService indicesService;
@@ -61,12 +67,11 @@ public DocumentChunkingProcessor(
         Settings settings,
         ClusterService clusterService,
         IndicesService indicesService,
-        AnalysisRegistry analysisRegistry,
-        Environment environment,
-        ProcessorInputValidator processorInputValidator
+        AnalysisRegistry analysisRegistry
     ) {
-        super(tag, description, TYPE, null, null, fieldMap, null, environment, processorInputValidator);
+        super(tag, description);
         validateAndParseAlgorithmMap(algorithmMap);
+        this.fieldMap = fieldMap;
         this.settings = settings;
         this.clusterService = clusterService;
         this.indicesService = indicesService;
@@ -116,7 +121,40 @@ private void validateAndParseAlgorithmMap(Map<String, Object> algorithmMap) {
     }
 
     @Override
-    protected List<Object> buildResultForListType(List<Object> sourceValue, List<?> results, IndexWrapper indexWrapper) {
+    public IngestDocument execute(IngestDocument ingestDocument) {
+        Map<String, Object> processMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
+        List<String> inferenceList = createInferenceList(processMap);
+        if (inferenceList.isEmpty()) {
+            return ingestDocument;
+        } else {
+            return doExecute(ingestDocument, processMap, inferenceList);
+        }
+    }
+
+    public IngestDocument doExecute(IngestDocument ingestDocument, Map<String, Object> ProcessMap, List<String> inferenceList) {
+        if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) {
+            // add maxTokenCount setting from index metadata to chunker parameters
+            Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+            String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString();
+            int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings);
+            IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName);
+            if (indexMetadata != null) {
+                // if the index exists, read maxTokenCount from the index setting
+                IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex());
+                maxTokenCount = indexService.getIndexSettings().getMaxTokenCount();
+            }
+            chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
+        }
+
+        List<List<String>> chunkedResults = new ArrayList<>();
+        for (String inferenceString : inferenceList) {
+            chunkedResults.add(chunk(inferenceString));
+        }
+        setTargetFieldsToDocument(ingestDocument, ProcessMap, chunkedResults);
+        return ingestDocument;
+    }
+
+    private List<Object> buildResultForListType(List<Object> sourceValue, List<?> results, InferenceProcessor.IndexWrapper indexWrapper) {
         Object peek = sourceValue.get(0);
         if (peek instanceof String) {
             List<Object> keyToResult = new ArrayList<>();
@@ -133,37 +171,151 @@ private List<Object> buildResultForListType(List<Object> sourceValue, List<?> resul
         }
     }
 
-    @Override
-    public void doExecute(
-        IngestDocument ingestDocument,
-        Map<String, Object> ProcessMap,
-        List<String> inferenceList,
-        BiConsumer<IngestDocument, Exception> handler
-    ) {
-        try {
-            processorInputValidator.validateFieldsValue(fieldMap, environment, ingestDocument, true);
-            if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) {
-                // add maxTokenCount setting from index metadata to chunker parameters
-                Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
-                String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString();
-                int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings);
-                IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName);
-                if (indexMetadata != null) {
-                    // if the index exists, read maxTokenCount from the index setting
-                    IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex());
-                    maxTokenCount = indexService.getIndexSettings().getMaxTokenCount();
-                }
-                chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
-            }
-
-            List<List<String>> chunkedResults = new ArrayList<>();
-            for (String inferenceString : inferenceList) {
-                chunkedResults.add(chunk(inferenceString));
-            }
-            setTargetFieldsToDocument(ingestDocument, ProcessMap, chunkedResults);
-            handler.accept(ingestDocument, null);
-        } catch (Exception e) {
-            handler.accept(null, e);
-        }
-    }
+    private Map<String, Object> buildMapWithProcessorKeyAndOriginalValue(IngestDocument ingestDocument) {
+        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+        Map<String, Object> mapWithProcessorKeys = new LinkedHashMap<>();
+        for (Map.Entry<String, Object> fieldMapEntry : fieldMap.entrySet()) {
+            String originalKey = fieldMapEntry.getKey();
+            Object targetKey = fieldMapEntry.getValue();
+            if (targetKey instanceof Map) {
+                Map<String, Object> treeRes = new LinkedHashMap<>();
+                buildMapWithProcessorKeyAndOriginalValueForMapType(originalKey, targetKey, sourceAndMetadataMap, treeRes);
+                mapWithProcessorKeys.put(originalKey, treeRes.get(originalKey));
+            } else {
+                mapWithProcessorKeys.put(String.valueOf(targetKey), sourceAndMetadataMap.get(originalKey));
+            }
+        }
+        return mapWithProcessorKeys;
+    }
+
+    private void buildMapWithProcessorKeyAndOriginalValueForMapType(
+        String parentKey,
+        Object processorKey,
+        Map<String, Object> sourceAndMetadataMap,
+        Map<String, Object> treeRes
+    ) {
+        if (processorKey == null || sourceAndMetadataMap == null) return;
+        if (processorKey instanceof Map) {
+            Map<String, Object> next = new LinkedHashMap<>();
+            if (sourceAndMetadataMap.get(parentKey) instanceof Map) {
+                for (Map.Entry<String, Object> nestedFieldMapEntry : ((Map<String, Object>) processorKey).entrySet()) {
+                    buildMapWithProcessorKeyAndOriginalValueForMapType(
+                        nestedFieldMapEntry.getKey(),
+                        nestedFieldMapEntry.getValue(),
+                        (Map<String, Object>) sourceAndMetadataMap.get(parentKey),
+                        next
+                    );
+                }
+            } else if (sourceAndMetadataMap.get(parentKey) instanceof List) {
+                for (Map.Entry<String, Object> nestedFieldMapEntry : ((Map<String, Object>) processorKey).entrySet()) {
+                    List<Map<String, Object>> list = (List<Map<String, Object>>) sourceAndMetadataMap.get(parentKey);
+                    List<Object> listOfStrings = list.stream().map(x -> x.get(nestedFieldMapEntry.getKey())).collect(Collectors.toList());
+                    Map<String, Object> map = new LinkedHashMap<>();
+                    map.put(nestedFieldMapEntry.getKey(), listOfStrings);
+                    buildMapWithProcessorKeyAndOriginalValueForMapType(
+                        nestedFieldMapEntry.getKey(),
+                        nestedFieldMapEntry.getValue(),
+                        map,
+                        next
+                    );
+                }
+            }
+            treeRes.put(parentKey, next);
+        } else {
+            String key = String.valueOf(processorKey);
+            treeRes.put(key, sourceAndMetadataMap.get(parentKey));
+        }
+    }
+
+    @SuppressWarnings({ "unchecked" })
+    private List<String> createInferenceList(Map<String, Object> knnKeyMap) {
+        List<String> texts = new ArrayList<>();
+        knnKeyMap.entrySet().stream().filter(knnMapEntry -> knnMapEntry.getValue() != null).forEach(knnMapEntry -> {
+            Object sourceValue = knnMapEntry.getValue();
+            if (sourceValue instanceof List) {
+                for (Object nestedValue : (List<Object>) sourceValue) {
+                    if (nestedValue instanceof String) {
+                        texts.add((String) nestedValue);
+                    } else {
+                        texts.addAll((List<String>) nestedValue);
+                    }
+                }
+            } else if (sourceValue instanceof Map) {
+                createInferenceListForMapTypeInput(sourceValue, texts);
+            } else {
+                texts.add(sourceValue.toString());
+            }
+        });
+        return texts;
+    }
+
+    @SuppressWarnings("unchecked")
+    private void createInferenceListForMapTypeInput(Object sourceValue, List<String> texts) {
+        if (sourceValue instanceof Map) {
+            ((Map<String, Object>) sourceValue).forEach((k, v) -> createInferenceListForMapTypeInput(v, texts));
+        } else if (sourceValue instanceof List) {
+            texts.addAll(((List<String>) sourceValue));
+        } else {
+            if (sourceValue == null) return;
+            texts.add(sourceValue.toString());
+        }
+    }
+
+    private void setTargetFieldsToDocument(IngestDocument ingestDocument, Map<String, Object> processorMap, List<?> results) {
+        Objects.requireNonNull(results, "embedding failed, inference returns null result!");
+        log.debug("Model inference result fetched, starting build vector output!");
+        Map<String, Object> result = buildResult(processorMap, results, ingestDocument.getSourceAndMetadata());
+        result.forEach(ingestDocument::setFieldValue);
+    }
+
+    @VisibleForTesting
+    Map<String, Object> buildResult(Map<String, Object> processorMap, List<?> results, Map<String, Object> sourceAndMetadataMap) {
+        InferenceProcessor.IndexWrapper indexWrapper = new InferenceProcessor.IndexWrapper(0);
+        Map<String, Object> result = new LinkedHashMap<>();
+        for (Map.Entry<String, Object> knnMapEntry : processorMap.entrySet()) {
+            String knnKey = knnMapEntry.getKey();
+            Object sourceValue = knnMapEntry.getValue();
+            if (sourceValue instanceof String) {
+                result.put(knnKey, results.get(indexWrapper.index++));
+            } else if (sourceValue instanceof List) {
+                result.put(knnKey, buildResultForListType((List<Object>) sourceValue, results, indexWrapper));
+            } else if (sourceValue instanceof Map) {
+                putResultToSourceMapForMapType(knnKey, sourceValue, results, indexWrapper, sourceAndMetadataMap);
+            }
+        }
+        return result;
+    }
+
+    @SuppressWarnings({ "unchecked" })
+    private void putResultToSourceMapForMapType(
+        String processorKey,
+        Object sourceValue,
+        List<?> results,
+        InferenceProcessor.IndexWrapper indexWrapper,
+        Map<String, Object> sourceAndMetadataMap
+    ) {
+        if (processorKey == null || sourceAndMetadataMap == null || sourceValue == null) return;
+        if (sourceValue instanceof Map) {
+            for (Map.Entry<String, Object> inputNestedMapEntry : ((Map<String, Object>) sourceValue).entrySet()) {
+                if (sourceAndMetadataMap.get(processorKey) instanceof List) {
+                    // build output for list of nested objects
+                    for (Map<String, Object> nestedElement : (List<Map<String, Object>>) sourceAndMetadataMap.get(processorKey)) {
+                        nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++));
+                    }
+                } else {
+                    putResultToSourceMapForMapType(
+                        inputNestedMapEntry.getKey(),
+                        inputNestedMapEntry.getValue(),
+                        results,
+                        indexWrapper,
+                        (Map<String, Object>) sourceAndMetadataMap.get(processorKey)
+                    );
+                }
+            }
+        } else if (sourceValue instanceof String) {
+            sourceAndMetadataMap.put(processorKey, results.get(indexWrapper.index++));
+        } else if (sourceValue instanceof List) {
+            sourceAndMetadataMap.put(processorKey, buildResultForListType((List<Object>) sourceValue, results, indexWrapper));
         }
     }
 
@@ -175,26 +327,13 @@ public static class Factory implements Processor.Factory {
 
         private final IndicesService indicesService;
 
-        private final Environment environment;
-
         private final AnalysisRegistry analysisRegistry;
 
-        private final ProcessorInputValidator processorInputValidator;
-
-        public Factory(
-            Settings settings,
-            ClusterService clusterService,
-            IndicesService indicesService,
-            AnalysisRegistry analysisRegistry,
-            Environment environment,
-            ProcessorInputValidator processorInputValidator
-        ) {
+        public Factory(Settings settings, ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry) {
            this.settings = settings;
            this.clusterService = clusterService;
            this.indicesService = indicesService;
            this.analysisRegistry = analysisRegistry;
-           this.environment = environment;
-           this.processorInputValidator = processorInputValidator;
        }
 
        @Override
@@ -214,9 +353,7 @@ public DocumentChunkingProcessor create(
                 settings,
                 clusterService,
                 indicesService,
-                analysisRegistry,
-                environment,
-                processorInputValidator
+                analysisRegistry
             );
         }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
index e54d8b1d8..a1ea84a5a 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
@@ -49,8 +49,6 @@ public abstract class InferenceProcessor extends AbstractProcessor {
 
     protected final Environment environment;
 
-    protected final ProcessorInputValidator processorInputValidator;
-
     public InferenceProcessor(
         String tag,
         String description,
@@ -59,19 +57,17 @@ public InferenceProcessor(
         String modelId,
         Map<String, Object> fieldMap,
         MLCommonsClientAccessor clientAccessor,
-        Environment environment,
-        ProcessorInputValidator processorInputValidator
+        Environment environment
     ) {
         super(tag, description);
         this.type = type;
         validateEmbeddingConfiguration(fieldMap);
-
+        if (StringUtils.isBlank(modelId)) throw new IllegalArgumentException("model_id is null or empty, cannot process it");
         this.listTypeNestedMapKey = listTypeNestedMapKey;
         this.modelId = modelId;
         this.fieldMap = fieldMap;
         this.mlCommonsClientAccessor = clientAccessor;
         this.environment = environment;
-        this.processorInputValidator = processorInputValidator;
     }
 
     private void validateEmbeddingConfiguration(Map<String, Object> fieldMap) {
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java
index 30486b048..8ae0f9a90 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java
@@ -8,7 +8,6 @@
 import java.util.Map;
 import java.util.function.BiConsumer;
 
-import org.apache.commons.lang3.StringUtils;
 import org.opensearch.core.action.ActionListener;
 import org.opensearch.env.Environment;
 import org.opensearch.ingest.IngestDocument;
@@ -33,11 +32,9 @@ public SparseEncodingProcessor(
         String modelId,
         Map<String, Object> fieldMap,
         MLCommonsClientAccessor clientAccessor,
-        Environment environment,
-        ProcessorInputValidator processorInputValidator
+        Environment environment
     ) {
-        super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment, processorInputValidator);
-        if (StringUtils.isBlank(modelId)) throw new IllegalArgumentException("model_id is null or empty, cannot process it");
+        super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment);
     }
 
     @Override
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
index d98d41728..04af25fc5 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
@@ -8,7 +8,6 @@
 import java.util.Map;
 import java.util.function.BiConsumer;
 
-import org.apache.commons.lang3.StringUtils;
 import org.opensearch.core.action.ActionListener;
 import org.opensearch.env.Environment;
 import org.opensearch.ingest.IngestDocument;
@@ -32,11 +31,9 @@ public TextEmbeddingProcessor(
         String modelId,
         Map<String, Object> fieldMap,
         MLCommonsClientAccessor clientAccessor,
-        Environment environment,
-        ProcessorInputValidator processorInputValidator
+        Environment environment
     ) {
-        super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment, processorInputValidator);
-        if (StringUtils.isBlank(modelId)) throw new IllegalArgumentException("model_id is null or empty, cannot process it");
+        super(tag, description, TYPE, LIST_TYPE_NESTED_MAP_KEY, modelId, fieldMap, clientAccessor, environment);
     }
 
     @Override
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java
index d5b90c406..95b2803a0 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/SparseEncodingProcessorFactory.java
@@ -15,7 +15,6 @@
 import org.opensearch.env.Environment;
 import org.opensearch.ingest.Processor;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
-import org.opensearch.neuralsearch.processor.ProcessorInputValidator;
 import org.opensearch.neuralsearch.processor.SparseEncodingProcessor;
 
 import lombok.extern.log4j.Log4j2;
@@ -27,16 +26,10 @@ public class SparseEncodingProcessorFactory implements Processor.Factory {
     private final MLCommonsClientAccessor clientAccessor;
     private final Environment environment;
 
-    private ProcessorInputValidator processorInputValidator;
-
-    public SparseEncodingProcessorFactory(
-        MLCommonsClientAccessor clientAccessor,
-        Environment environment,
-        ProcessorInputValidator processorInputValidator
-    ) {
+    public SparseEncodingProcessorFactory(MLCommonsClientAccessor clientAccessor, Environment environment) {
         this.clientAccessor = clientAccessor;
         this.environment = environment;
-        this.processorInputValidator = processorInputValidator;
     }
 
     @Override
@@ -49,14 +42,6 @@ public SparseEncodingProcessor create(
         String modelId = readStringProperty(TYPE, processorTag, config, MODEL_ID_FIELD);
         Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
 
-        return new SparseEncodingProcessor(
-            processorTag,
-            description,
-            modelId,
-            fieldMap,
-            clientAccessor,
-            environment,
-            processorInputValidator
-        );
+        return new SparseEncodingProcessor(processorTag, description, modelId, fieldMap, clientAccessor, environment);
     }
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java
index 061ac3474..7802cb1f6 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextEmbeddingProcessorFactory.java
@@ -15,7 +15,6 @@
 import org.opensearch.env.Environment;
 import org.opensearch.ingest.Processor;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
-import org.opensearch.neuralsearch.processor.ProcessorInputValidator;
 import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor;
 
 /**
@@ -27,16 +26,9 @@ public class TextEmbeddingProcessorFactory implements Processor.Factory {
 
     private final Environment environment;
 
-    private ProcessorInputValidator processorInputValidator;
-
-    public TextEmbeddingProcessorFactory(
-        final MLCommonsClientAccessor clientAccessor,
-        final Environment environment,
-        ProcessorInputValidator processorInputValidator
-    ) {
+    public TextEmbeddingProcessorFactory(final MLCommonsClientAccessor clientAccessor, final Environment environment) {
         this.clientAccessor = clientAccessor;
         this.environment = environment;
-        this.processorInputValidator = processorInputValidator;
     }
 
     @Override
@@ -48,14 +40,6 @@ public TextEmbeddingProcessor create(
     ) throws Exception {
         String modelId = readStringProperty(TYPE, processorTag, config, MODEL_ID_FIELD);
         Map<String, Object> filedMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
-        return new TextEmbeddingProcessor(
-            processorTag,
-            description,
-            modelId,
-            filedMap,
-            clientAccessor,
-            environment,
-            processorInputValidator
-        );
+        return new TextEmbeddingProcessor(processorTag, description, modelId, filedMap, clientAccessor, environment);
     }
 }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
index b25ab3269..f515bc6e4 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java
@@ -49,9 +49,6 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase {
     private static final String OUTPUT_FIELD = "body_chunk";
     private static final String INDEX_NAME = "_index";
 
-    @Mock
-    private ProcessorInputValidator processorInputValidator;
-
     @Mock
     private Environment environment;
 
@@ -85,14 +82,7 @@ public void setup() {
         when(metadata.index(anyString())).thenReturn(null);
         when(clusterState.metadata()).thenReturn(metadata);
         when(clusterService.state()).thenReturn(clusterState);
-        factory = new DocumentChunkingProcessor.Factory(
-            settings,
-            clusterService,
-            indicesService,
-            getAnalysisRegistry(),
-            environment,
-            processorInputValidator
-        );
+        factory = new DocumentChunkingProcessor.Factory(settings, clusterService, indicesService, getAnalysisRegistry());
     }
 
     private Map<String, Object> createFixedTokenLengthParameters() {
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java
index 36cad43e3..815ea851b 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessorTests.java
@@ -51,9 +51,6 @@ public class SparseEncodingProcessorTests extends OpenSearchTestCase {
     @Mock
     private Environment env;
 
-    @Mock
-    private ProcessorInputValidator processorInputValidator;
-
     @InjectMocks
     private SparseEncodingProcessorFactory SparseEncodingProcessorFactory;
     private static final String PROCESSOR_TAG = "mockTag";
@@ -100,11 +97,7 @@ public void testExecute_whenInferenceTextListEmpty_SuccessWithoutAnyMap() {
         IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
         Map<String, Processor.Factory> registry = new HashMap<>();
         MLCommonsClientAccessor accessor = mock(MLCommonsClientAccessor.class);
-        SparseEncodingProcessorFactory sparseEncodingProcessorFactory = new SparseEncodingProcessorFactory(
-            accessor,
-            env,
-            processorInputValidator
-        );
+        SparseEncodingProcessorFactory sparseEncodingProcessorFactory = new SparseEncodingProcessorFactory(accessor, env);
 
         Map<String, Object> config = new HashMap<>();
         config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId");
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
index db323bd4b..25d41c345 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
@@ -51,9 +51,6 @@ public class TextEmbeddingProcessorTests extends OpenSearchTestCase {
     @Mock
     private Environment env;
 
-    @Mock
-    private ProcessorInputValidator processorInputValidator;
-
     @InjectMocks
     private TextEmbeddingProcessorFactory textEmbeddingProcessorFactory;
     private static final String PROCESSOR_TAG = "mockTag";
@@ -130,11 +127,7 @@ public void testExecute_whenInferenceThrowInterruptedException_throwRuntimeExcep
         IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
         Map<String, Processor.Factory> registry = new HashMap<>();
         MLCommonsClientAccessor accessor = mock(MLCommonsClientAccessor.class);
-        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(
-            accessor,
-            env,
-            processorInputValidator
-        );
+        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(accessor, env);
 
         Map<String, Object> config = new HashMap<>();
         config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId");
@@ -152,11 +145,7 @@ public void testExecute_whenInferenceTextListEmpty_SuccessWithoutEmbedding() {
         IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
         Map<String, Processor.Factory> registry = new HashMap<>();
         MLCommonsClientAccessor accessor = mock(MLCommonsClientAccessor.class);
-        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(
-            accessor,
-            env,
-            processorInputValidator
-        );
+        TextEmbeddingProcessorFactory textEmbeddingProcessorFactory = new TextEmbeddingProcessorFactory(accessor, env);
 
         Map<String, Object> config = new HashMap<>();
         config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId");
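With this patch the chunking processor stops inheriting from InferenceProcessor entirely: chunking needs no model round-trip, so it can implement the synchronous execute(IngestDocument) variant instead of the asynchronous callback form that the embedding processors use. A stripped-down sketch of that shape follows; the class name and type string are invented for illustration, and only the processor skeleton is shown.

    import org.opensearch.ingest.AbstractProcessor;
    import org.opensearch.ingest.IngestDocument;

    public final class SynchronousSketchProcessor extends AbstractProcessor {

        SynchronousSketchProcessor(String tag, String description) {
            super(tag, description);
        }

        @Override
        public IngestDocument execute(IngestDocument ingestDocument) {
            // read fields, transform them, and write results back on the same thread;
            // no BiConsumer callback is needed because there is no async model call
            return ingestDocument;
        }

        @Override
        public String getType() {
            return "sketch";
        }
    }

Dropping the shared ProcessorInputValidator in the same patch keeps the embedding processors' stricter validation (non-empty strings required) separate from the chunking processor's looser one, at the cost of the duplicated map-walking helpers visible above.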
From a7a92607920ff2b2c90e51809c845c96e9bf38f4 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Wed, 6 Mar 2024 12:03:29 +0800
Subject: [PATCH 057/189] remove processor validator

Signed-off-by: yuye-aws
---
 .../processor/ProcessorInputValidator.java | 93 -------------------
 1 file changed, 93 deletions(-)
 delete mode 100644 src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java
deleted file mode 100644
index 57766c911..000000000
--- a/src/main/java/org/opensearch/neuralsearch/processor/ProcessorInputValidator.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-package org.opensearch.neuralsearch.processor;
-
-import org.apache.commons.lang3.StringUtils;
-import org.opensearch.env.Environment;
-import org.opensearch.index.mapper.MapperService;
-import org.opensearch.ingest.IngestDocument;
-
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-import java.util.function.Supplier;
-
-public class ProcessorInputValidator {
-
-    public void validateFieldsValue(
-        Map<String, Object> fieldMap,
-        Environment environment,
-        IngestDocument ingestDocument,
-        boolean allowEmpty
-    ) {
-        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
-        for (Map.Entry<String, Object> embeddingFieldsEntry : fieldMap.entrySet()) {
-            Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey());
-            if (sourceValue != null) {
-                String sourceKey = embeddingFieldsEntry.getKey();
-                Class<?> sourceValueClass = sourceValue.getClass();
-                if (List.class.isAssignableFrom(sourceValueClass) || Map.class.isAssignableFrom(sourceValueClass)) {
-                    validateNestedTypeValue(sourceKey, sourceValue, environment, allowEmpty, () -> 1);
-                } else if (!String.class.isAssignableFrom(sourceValueClass)) {
-                    throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it");
-                } else if (!allowEmpty && StringUtils.isBlank(sourceValue.toString())) {
-                    throw new IllegalArgumentException("field [" + sourceKey + "] has empty string value, cannot process it");
-                }
-            }
-        }
-    }
-
-    @SuppressWarnings({ "rawtypes", "unchecked" })
-    private void validateNestedTypeValue(
-        String sourceKey,
-        Object sourceValue,
-        Environment environment,
-        boolean allowEmpty,
-        Supplier<Integer> maxDepthSupplier
-    ) {
-        int maxDepth = maxDepthSupplier.get();
-        if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) {
-            throw new IllegalArgumentException("map type field [" + sourceKey + "] reached max depth limit, cannot process it");
-        } else if ((List.class.isAssignableFrom(sourceValue.getClass()))) {
-            validateListTypeValue(sourceKey, sourceValue, environment, allowEmpty, maxDepthSupplier);
-        } else if (Map.class.isAssignableFrom(sourceValue.getClass())) {
-            ((Map) sourceValue).values()
-                .stream()
-                .filter(Objects::nonNull)
-                .forEach(x -> validateNestedTypeValue(sourceKey, x, environment, allowEmpty, () -> maxDepth + 1));
-        } else if (!String.class.isAssignableFrom(sourceValue.getClass())) {
-            throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it");
-        } else if (!allowEmpty && StringUtils.isBlank(sourceValue.toString())) {
-            throw new IllegalArgumentException("map type field [" + sourceKey + "] has empty string, cannot process it");
-        }
-    }
-
-    @SuppressWarnings({ "rawtypes" })
-    private void validateListTypeValue(
-        String sourceKey,
-        Object sourceValue,
-        Environment environment,
-        boolean allowEmpty,
-        Supplier<Integer> maxDepthSupplier
-    ) {
-        for (Object value : (List) sourceValue) {
-            if (value instanceof Map) {
-                validateNestedTypeValue(sourceKey, value, environment, allowEmpty, () -> maxDepthSupplier.get() + 1);
-            } else if (value == null) {
-                throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it");
-            } else if (value instanceof List) {
-                for (Object nestedValue : (List) sourceValue) {
-                    if (!(nestedValue instanceof String)) {
-                        throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it");
-                    }
-                }
-            } else if (!(value instanceof String)) {
-                throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it");
-            } else if (!allowEmpty && StringUtils.isBlank(value.toString())) {
-                throw new IllegalArgumentException("list type field [" + sourceKey + "] has empty string, cannot process it");
-            }
-        }
-    }
-}

From 39e8df59b965df91459f1e983a54d7856070f1fc Mon Sep 17 00:00:00 2001
From: Yuye Zhu
Date: Wed, 6 Mar 2024 12:06:46 +0800
Subject: [PATCH 058/189] Revert InferenceProcessor.java

Signed-off-by: Yuye Zhu
Signed-off-by: yuye-aws
---
 .../processor/InferenceProcessor.java | 125 ++++++++++++------
 1 file changed, 81 insertions(+), 44 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
index a1ea84a5a..fe201abae 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java
@@ -10,11 +10,13 @@
 import java.util.Map;
 import java.util.Objects;
 import java.util.function.BiConsumer;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
 import org.apache.commons.lang3.StringUtils;
 import org.opensearch.env.Environment;
+import org.opensearch.index.mapper.MapperService;
 import org.opensearch.ingest.AbstractProcessor;
 import org.opensearch.ingest.IngestDocument;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
@@ -43,11 +45,11 @@ public abstract class InferenceProcessor extends AbstractProcessor {
 
     protected final String modelId;
 
-    protected final Map<String, Object> fieldMap;
+    private final Map<String, Object> fieldMap;
 
     protected final MLCommonsClientAccessor mlCommonsClientAccessor;
 
-    protected final Environment environment;
+    private final Environment environment;
 
     public InferenceProcessor(
         String tag,
@@ -61,8 +63,9 @@ public InferenceProcessor(
     ) {
         super(tag, description);
         this.type = type;
-        validateEmbeddingConfiguration(fieldMap);
         if (StringUtils.isBlank(modelId)) throw new IllegalArgumentException("model_id is null or empty, cannot process it");
+        validateEmbeddingConfiguration(fieldMap);
+
         this.listTypeNestedMapKey = listTypeNestedMapKey;
         this.modelId = modelId;
         this.fieldMap = fieldMap;
@@ -103,13 +106,13 @@ public IngestDocument execute(IngestDocument ingestDocument) throws Exception {
     @Override
     public void execute(IngestDocument ingestDocument, BiConsumer<IngestDocument, Exception> handler) {
         try {
-            processorInputValidator.validateFieldsValue(fieldMap, environment, ingestDocument, false);
-            Map<String, Object> processMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
-            List<String> inferenceList = createInferenceList(processMap);
-            if (inferenceList.isEmpty()) {
+            validateEmbeddingFieldsValue(ingestDocument);
+            Map<String, Object> ProcessMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
+            List<String> inferenceList = createInferenceList(ProcessMap);
+            if (inferenceList.size() == 0) {
                 handler.accept(ingestDocument, null);
             } else {
-                doExecute(ingestDocument, processMap, inferenceList, handler);
+                doExecute(ingestDocument, ProcessMap, inferenceList, handler);
             }
         } catch (Exception e) {
             handler.accept(null, e);
@@ -117,18 +120,12 @@ public void execute(IngestDocument ingestDocument, BiConsumer<IngestDocument, E
     }
 
     @SuppressWarnings({ "unchecked" })
-    protected List<String> createInferenceList(Map<String, Object> knnKeyMap) {
+    private List<String> createInferenceList(Map<String, Object> knnKeyMap) {
         List<String> texts = new ArrayList<>();
         knnKeyMap.entrySet().stream().filter(knnMapEntry -> knnMapEntry.getValue() != null).forEach(knnMapEntry -> {
             Object sourceValue = knnMapEntry.getValue();
             if (sourceValue instanceof List) {
-                for (Object nestedValue : (List<Object>) sourceValue) {
-                    if (nestedValue instanceof String) {
-                        texts.add((String) nestedValue);
-                    } else {
-                        texts.addAll((List<String>) nestedValue);
-                    }
-                }
+                texts.addAll(((List<String>) sourceValue));
             } else if (sourceValue instanceof Map) {
                 createInferenceListForMapTypeInput(sourceValue, texts);
             } else {
@@ -207,16 +204,68 @@ private void buildMapWithProcessorKeyAndOriginalValueForMapType(
         }
     }
 
-    protected void setTargetFieldsToDocument(IngestDocument ingestDocument, Map<String, Object> processorMap, List<?> results) {
+    private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) {
+        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+        for (Map.Entry<String, Object> embeddingFieldsEntry : fieldMap.entrySet()) {
+            Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey());
+            if (sourceValue != null) {
+                String sourceKey = embeddingFieldsEntry.getKey();
+                Class<?> sourceValueClass = sourceValue.getClass();
+                if (List.class.isAssignableFrom(sourceValueClass) || Map.class.isAssignableFrom(sourceValueClass)) {
+                    validateNestedTypeValue(sourceKey, sourceValue, () -> 1);
+                } else if (!String.class.isAssignableFrom(sourceValueClass)) {
+                    throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it");
+                } else if (StringUtils.isBlank(sourceValue.toString())) {
+                    throw new IllegalArgumentException("field [" + sourceKey + "] has empty string value, cannot process it");
+                }
+            }
+        }
+    }
+
+    @SuppressWarnings({ "rawtypes", "unchecked" })
+    private void validateNestedTypeValue(String sourceKey, Object sourceValue, Supplier<Integer> maxDepthSupplier) {
+        int maxDepth = maxDepthSupplier.get();
+        if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) {
+            throw new IllegalArgumentException("map type field [" + sourceKey + "] reached max depth limit, cannot process it");
+        } else if ((List.class.isAssignableFrom(sourceValue.getClass()))) {
+            validateListTypeValue(sourceKey, sourceValue, maxDepthSupplier);
+        } else if (Map.class.isAssignableFrom(sourceValue.getClass())) {
+            ((Map) sourceValue).values()
+                .stream()
+                .filter(Objects::nonNull)
+                .forEach(x -> validateNestedTypeValue(sourceKey, x, () -> maxDepth + 1));
+        } else if (!String.class.isAssignableFrom(sourceValue.getClass())) {
+            throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it");
+        } else if (StringUtils.isBlank(sourceValue.toString())) {
+            throw new IllegalArgumentException("map type field [" + sourceKey + "] has empty string, cannot process it");
+        }
+    }
+
+    @SuppressWarnings({ "rawtypes" })
+    private void validateListTypeValue(String sourceKey, Object sourceValue, Supplier<Integer> maxDepthSupplier) {
+        for (Object value : (List) sourceValue) {
+            if (value instanceof Map) {
+                validateNestedTypeValue(sourceKey, value, () -> maxDepthSupplier.get() + 1);
+            } else if (value == null) {
+                throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it");
+            } else if (!(value instanceof String)) {
+                throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it");
+            } else if (StringUtils.isBlank(value.toString())) {
+                throw new IllegalArgumentException("list type field [" + sourceKey + "] has empty string, cannot process it");
+            }
+        }
+    }
+
+    protected void setVectorFieldsToDocument(IngestDocument ingestDocument, Map<String, Object> processorMap, List<?> results) {
         Objects.requireNonNull(results, "embedding failed, inference returns null result!");
         log.debug("Model inference result fetched, starting build vector output!");
-        Map<String, Object> result = buildResult(processorMap, results, ingestDocument.getSourceAndMetadata());
-        result.forEach(ingestDocument::setFieldValue);
+        Map<String, Object> nlpResult = buildNLPResult(processorMap, results, ingestDocument.getSourceAndMetadata());
+        nlpResult.forEach(ingestDocument::setFieldValue);
     }
 
     @SuppressWarnings({ "unchecked" })
     @VisibleForTesting
-    Map<String, Object> buildResult(Map<String, Object> processorMap, List<?> results, Map<String, Object> sourceAndMetadataMap) {
+    Map<String, Object> buildNLPResult(Map<String, Object> processorMap, List<?> results, Map<String, Object> sourceAndMetadataMap) {
         IndexWrapper indexWrapper = new IndexWrapper(0);
         Map<String, Object> result = new LinkedHashMap<>();
         for (Map.Entry<String, Object> knnMapEntry : processorMap.entrySet()) {
@@ -225,16 +274,16 @@ Map<String, Object> buildResult(Map<String, Object> processorMap, List<?> result
             if (sourceValue instanceof String) {
                 result.put(knnKey, results.get(indexWrapper.index++));
             } else if (sourceValue instanceof List) {
-                result.put(knnKey, buildResultForListType((List<String>) sourceValue, results, indexWrapper));
+                result.put(knnKey, buildNLPResultForListType((List<String>) sourceValue, results, indexWrapper));
             } else if (sourceValue instanceof Map) {
-                putResultToSourceMapForMapType(knnKey, sourceValue, results, indexWrapper, sourceAndMetadataMap);
+                putNLPResultToSourceMapForMapType(knnKey, sourceValue, results, indexWrapper, sourceAndMetadataMap);
             }
         }
         return result;
     }
 
     @SuppressWarnings({ "unchecked" })
-    private void putResultToSourceMapForMapType(
+    private void putNLPResultToSourceMapForMapType(
         String processorKey,
         Object sourceValue,
         List<?> results,
@@ -245,12 +294,12 @@ private void putResultToSourceMapForMapType(
         if (sourceValue instanceof Map) {
             for (Map.Entry<String, Object> inputNestedMapEntry : ((Map<String, Object>) sourceValue).entrySet()) {
                 if (sourceAndMetadataMap.get(processorKey) instanceof List) {
-                    // build output for list of nested objects
+                    // build nlp output for list of nested objects
                     for (Map<String, Object> nestedElement : (List<Map<String, Object>>) sourceAndMetadataMap.get(processorKey)) {
                         nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++));
                     }
                 } else {
-                    putResultToSourceMapForMapType(
+                    putNLPResultToSourceMapForMapType(
                         inputNestedMapEntry.getKey(),
                         inputNestedMapEntry.getValue(),
                         results,
@@ -262,27 +311,15 @@ private void putResultToSourceMapForMapType(
         } else if (sourceValue instanceof String) {
             sourceAndMetadataMap.put(processorKey, results.get(indexWrapper.index++));
         } else if (sourceValue instanceof List) {
-            sourceAndMetadataMap.put(processorKey, buildResultForListType((List<String>) sourceValue, results, indexWrapper));
+            sourceAndMetadataMap.put(processorKey, buildNLPResultForListType((List<String>) sourceValue, results, indexWrapper));
         }
     }
 
-    protected List<Object> buildResultForListType(List<Object> sourceValue, List<?> results, IndexWrapper indexWrapper) {
-        Object peek = sourceValue.get(0);
-        if (peek instanceof String) {
-            List<Map<String, Object>> keyToResult = new ArrayList<>();
-            IntStream.range(0, sourceValue.size())
-                .forEachOrdered(x -> keyToResult.add(ImmutableMap.of(listTypeNestedMapKey, results.get(indexWrapper.index++))));
-            return keyToResult;
-        } else {
-            List<List<Map<String, Object>>> keyToResult = new ArrayList<>();
-            for (Object nestedList : sourceValue) {
-                List<Map<String, Object>> nestedResult = new ArrayList<>();
-                IntStream.range(0, ((List) nestedList).size())
-                    .forEachOrdered(x -> nestedResult.add(ImmutableMap.of(listTypeNestedMapKey, results.get(indexWrapper.index++))));
-                keyToResult.add(nestedResult);
-            }
-            return keyToResult;
-        }
+    private List<Map<String, Object>> buildNLPResultForListType(List<String> sourceValue, List<?> results, IndexWrapper indexWrapper) {
+        List<Map<String, Object>> keyToResult = new ArrayList<>();
+        IntStream.range(0, sourceValue.size())
+            .forEachOrdered(x -> keyToResult.add(ImmutableMap.of(listTypeNestedMapKey, results.get(indexWrapper.index++))));
+        return keyToResult;
     }
 
     @Override
@@ -299,7 +336,7 @@ public String getType() {
      * index: the index pointer of the text embedding result.
      */
     static class IndexWrapper {
-        protected int index;
+        private int index;
 
         protected IndexWrapper(int index) {
            this.index = index;
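The restored validateNestedTypeValue guards its recursion with the index.mapping.depth.limit setting, threading the current depth through a Supplier so that each step into a nested map or list passes () -> depth + 1 and the limit check stays in one place. A self-contained sketch of that pattern, with a fixed constant standing in for the cluster setting:

    import java.util.List;
    import java.util.Map;
    import java.util.function.Supplier;

    public class DepthGuardSketch {
        static final int MAX_DEPTH = 20; // stands in for index.mapping.depth.limit

        static void validate(Object value, Supplier<Integer> depthSupplier) {
            int depth = depthSupplier.get();
            if (depth > MAX_DEPTH) {
                throw new IllegalArgumentException("reached max depth limit");
            }
            if (value instanceof Map) {
                // recurse into nested map values one level deeper
                ((Map<?, ?>) value).values().forEach(v -> validate(v, () -> depth + 1));
            } else if (value instanceof List) {
                // recurse into list elements one level deeper
                ((List<?>) value).forEach(v -> validate(v, () -> depth + 1));
            } else if (!(value instanceof String)) {
                throw new IllegalArgumentException("has non-string type");
            }
        }

        public static void main(String[] args) {
            validate(Map.of("a", Map.of("b", "ok")), () -> 1); // passes at depth 2
        }
    }

Passing the depth lazily rather than as a plain int changes little functionally here; it mainly lets callers defer the depth computation until the check actually runs, which is the shape the reverted code preserves.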
From 2ee19232d069b4c82b6cbc43857fcd3a006c9856 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Wed, 6 Mar 2024 12:11:17 +0800
Subject: [PATCH 059/189] revert changes in text embedding and sparse encoding
 processor

Signed-off-by: yuye-aws
---
 .../processor/SparseEncodingProcessor.java      |  2 +-
 .../processor/TextEmbeddingProcessor.java       |  2 +-
 .../processor/TextEmbeddingProcessorTests.java  | 14 +++++++-------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java
index 8ae0f9a90..8acf95bf7 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/SparseEncodingProcessor.java
@@ -45,7 +45,7 @@ public void doExecute(
         BiConsumer<IngestDocument, Exception> handler
     ) {
         mlCommonsClientAccessor.inferenceSentencesWithMapResult(this.modelId, inferenceList, ActionListener.wrap(resultMaps -> {
-            setTargetFieldsToDocument(ingestDocument, ProcessMap, TokenWeightUtil.fetchListOfTokenWeightMap(resultMaps));
+            setVectorFieldsToDocument(ingestDocument, ProcessMap, TokenWeightUtil.fetchListOfTokenWeightMap(resultMaps));
             handler.accept(ingestDocument, null);
         }, e -> { handler.accept(null, e); }));
     }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
index 04af25fc5..c1b8f92a6 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java
@@ -44,7 +44,7 @@ public void doExecute(
         BiConsumer<IngestDocument, Exception> handler
     ) {
         mlCommonsClientAccessor.inferenceSentences(this.modelId, inferenceList, ActionListener.wrap(vectors -> {
-            setTargetFieldsToDocument(ingestDocument, ProcessMap, vectors);
+            setVectorFieldsToDocument(ingestDocument, ProcessMap, vectors);
             handler.accept(ingestDocument, null);
         }, e -> { handler.accept(null, e); }));
     }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
index db323bd4b..25d41c345 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java
@@ -357,7 +357,7 @@ public void testProcessResponse_successful() throws Exception {
         Map<String, Object> knnMap = processor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
 
         List<List<Float>> modelTensorList = createMockVectorResult();
-        processor.setTargetFieldsToDocument(ingestDocument, knnMap, modelTensorList);
+        processor.setVectorFieldsToDocument(ingestDocument, knnMap, modelTensorList);
         assertEquals(12, ingestDocument.getSourceAndMetadata().size());
     }
@@ -378,7 +378,7 @@ public void testBuildVectorOutput_withPlainStringValue_successful() {
         assertEquals(knnKeyList.get(lastIndex), configValueList.get(lastIndex).toString());
 
         List<List<Float>> modelTensorList = createMockVectorResult();
-        Map<String, Object> result = processor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
+        Map<String, Object> result = processor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
         assertTrue(result.containsKey("oriKey1_knn"));
         assertTrue(result.containsKey("oriKey2_knn"));
         assertTrue(result.containsKey("oriKey3_knn"));
@@ -395,7 +395,7 @@ public void testBuildVectorOutput_withNestedMap_successful() {
         TextEmbeddingProcessor processor = createInstanceWithNestedMapConfiguration(config);
         Map<String, Object> knnMap = processor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
         List<List<Float>> modelTensorList = createMockVectorResult();
-        processor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
+        processor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
         Map<String, Object> favoritesMap = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("favorites");
         assertNotNull(favoritesMap);
         Map<String, Object> favoriteGames = (Map<String, Object>) favoritesMap.get("favorite.games");
@@ -411,7 +411,7 @@ public void testBuildVectorOutput_withNestedList_successful() {
         TextEmbeddingProcessor textEmbeddingProcessor = createInstanceWithNestedMapConfiguration(config);
         Map<String, Object> knnMap = textEmbeddingProcessor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
         List<List<Float>> modelTensorList = createMockVectorResult();
-        textEmbeddingProcessor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
+        textEmbeddingProcessor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
         List<Map<String, Object>> nestedObj = (List<Map<String, Object>>) ingestDocument.getSourceAndMetadata().get("nestedField");
         assertTrue(nestedObj.get(0).containsKey("vectorField"));
         assertTrue(nestedObj.get(1).containsKey("vectorField"));
@@ -425,7 +425,7 @@ public void testBuildVectorOutput_withNestedList_Level2_successful() {
         TextEmbeddingProcessor textEmbeddingProcessor = createInstanceWithNestedMapConfiguration(config);
         Map<String, Object> knnMap = textEmbeddingProcessor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
         List<List<Float>> modelTensorList = createMockVectorResult();
-        textEmbeddingProcessor.buildResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
+        textEmbeddingProcessor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata());
         Map<String, Object> nestedLevel1 = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("nestedField");
         List<Map<String, Object>> nestedObj = (List<Map<String, Object>>) nestedLevel1.get("nestedField");
         assertTrue(nestedObj.get(0).containsKey("vectorField"));
@@ -440,10 +440,10 @@ public void test_updateDocument_appendVectorFieldsToDocument_successful() {
         TextEmbeddingProcessor processor = createInstanceWithNestedMapConfiguration(config);
         Map<String, Object> knnMap = processor.buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
         List<List<Float>> modelTensorList = createMockVectorResult();
-        processor.setTargetFieldsToDocument(ingestDocument, knnMap, modelTensorList);
+        processor.setVectorFieldsToDocument(ingestDocument, knnMap, modelTensorList);
 
         List<List<Float>> modelTensorList1 = createMockVectorResult();
-        processor.setTargetFieldsToDocument(ingestDocument, knnMap, modelTensorList1);
+        processor.setVectorFieldsToDocument(ingestDocument, knnMap, modelTensorList1);
         assertEquals(12, ingestDocument.getSourceAndMetadata().size());
         assertEquals(2, ((List<?>) ingestDocument.getSourceAndMetadata().get("oriKey6_knn")).size());
     }
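With the embedding processors back on setVectorFieldsToDocument and buildNLPResult, list-valued inputs again produce one nested map per source string, keyed by the processor's listTypeNestedMapKey. A runnable sketch of that output shape, assuming "knn" as the nested key (the key the text embedding processor declares) and small float vectors as stand-in results:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.stream.IntStream;

    public class ListResultShapeSketch {
        public static void main(String[] args) {
            String listTypeNestedMapKey = "knn"; // assumed key for illustration
            List<String> sourceValue = List.of("sentence one", "sentence two");
            List<List<Float>> results = List.of(List.of(0.1f, 0.2f), List.of(0.3f, 0.4f));

            // mirror buildNLPResultForListType: wrap each result under the nested key
            List<Map<String, Object>> keyToResult = new ArrayList<>();
            IntStream.range(0, sourceValue.size())
                .forEachOrdered(i -> keyToResult.add(Map.of(listTypeNestedMapKey, results.get(i))));

            System.out.println(keyToResult); // [{knn=[0.1, 0.2]}, {knn=[0.3, 0.4]}]
        }
    }

This wrapped shape is exactly what the chunking processor had to avoid, which is why its earlier override emitted plain lists; the next patch removes the inheritance-based workaround altogether.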
settings; + this.environment = environment; this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; @@ -82,12 +80,6 @@ public String getType() { return TYPE; } - private List chunk(String content) { - // assume that content is either a map, list or string - IFieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); - return chunker.chunk(content, chunkerParameters); - } - @SuppressWarnings("unchecked") private void validateAndParseAlgorithmMap(Map algorithmMap) { if (algorithmMap.size() != 1) { @@ -120,23 +112,56 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { } } - @Override - public IngestDocument execute(IngestDocument ingestDocument) { - Map processMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument); - List inferenceList = createInferenceList(processMap); - if (inferenceList.isEmpty()) { - return ingestDocument; - } else { - return doExecute(ingestDocument, processMap, inferenceList); + @SuppressWarnings("unchecked") + private boolean isListString(Object value) { + // an empty list is also List + if (!(value instanceof List)) { + return false; + } + for (Object element : (List) value) { + if (!(element instanceof String)) { + return false; + } + } + return true; + } + + private List chunkString(String content) { + // assume that content is either a map, list or string + IFieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); + return chunker.chunk(content, chunkerParameters); + } + + private List chunkList(List contentList) { + // flatten the List> output to List + List result = new ArrayList<>(); + for (String content : contentList) { + result.addAll(chunkString(content)); + } + return result; + } + + @SuppressWarnings("unchecked") + private List chunkLeafType(Object value) { + // leaf type is either String or List + List chunkedResult = null; + if (value instanceof String) { + chunkedResult = chunkString(String.valueOf(value)); + } else if (isListString(value)) { + chunkedResult = chunkList((List) value); } + return chunkedResult; } - public IngestDocument doExecute(IngestDocument ingestDocument, Map ProcessMap, List inferenceList) { + @Override + public IngestDocument execute(IngestDocument ingestDocument) { + validateEmbeddingFieldsValue(ingestDocument); + if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); - int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings); + int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(environment.settings()); IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); if (indexMetadata != null) { // if the index exists, read maxTokenCount from the index setting @@ -146,182 +171,93 @@ public IngestDocument doExecute(IngestDocument ingestDocument, Map> chunkedResults = new ArrayList<>(); - for (String inferenceString : inferenceList) { - chunkedResults.add(chunk(inferenceString)); - } - setTargetFieldsToDocument(ingestDocument, ProcessMap, chunkedResults); + Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); + chunkMapType(sourceAndMetadataMap, fieldMap); + sourceAndMetadataMap.forEach(ingestDocument::setFieldValue); return ingestDocument; } - private List buildResultForListType(List sourceValue, List results, InferenceProcessor.IndexWrapper 
indexWrapper) { - Object peek = sourceValue.get(0); - if (peek instanceof String) { - List keyToResult = new ArrayList<>(); - IntStream.range(0, sourceValue.size()).forEachOrdered(x -> keyToResult.add(results.get(indexWrapper.index++))); - return keyToResult; - } else { - List> keyToResult = new ArrayList<>(); - for (Object nestedList : sourceValue) { - List nestedResult = new ArrayList<>(); - IntStream.range(0, ((List) nestedList).size()).forEachOrdered(x -> nestedResult.add(results.get(indexWrapper.index++))); - keyToResult.add(nestedResult); - } - return keyToResult; - } - } - - private Map buildMapWithProcessorKeyAndOriginalValue(IngestDocument ingestDocument) { + private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) { Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); - Map mapWithProcessorKeys = new LinkedHashMap<>(); - for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { - String originalKey = fieldMapEntry.getKey(); - Object targetKey = fieldMapEntry.getValue(); - if (targetKey instanceof Map) { - Map treeRes = new LinkedHashMap<>(); - buildMapWithProcessorKeyAndOriginalValueForMapType(originalKey, targetKey, sourceAndMetadataMap, treeRes); - mapWithProcessorKeys.put(originalKey, treeRes.get(originalKey)); - } else { - mapWithProcessorKeys.put(String.valueOf(targetKey), sourceAndMetadataMap.get(originalKey)); - } - } - return mapWithProcessorKeys; - } - - private void buildMapWithProcessorKeyAndOriginalValueForMapType( - String parentKey, - Object processorKey, - Map sourceAndMetadataMap, - Map treeRes - ) { - if (processorKey == null || sourceAndMetadataMap == null) return; - if (processorKey instanceof Map) { - Map next = new LinkedHashMap<>(); - if (sourceAndMetadataMap.get(parentKey) instanceof Map) { - for (Map.Entry nestedFieldMapEntry : ((Map) processorKey).entrySet()) { - buildMapWithProcessorKeyAndOriginalValueForMapType( - nestedFieldMapEntry.getKey(), - nestedFieldMapEntry.getValue(), - (Map) sourceAndMetadataMap.get(parentKey), - next - ); - } - } else if (sourceAndMetadataMap.get(parentKey) instanceof List) { - for (Map.Entry nestedFieldMapEntry : ((Map) processorKey).entrySet()) { - List> list = (List>) sourceAndMetadataMap.get(parentKey); - List listOfStrings = list.stream().map(x -> x.get(nestedFieldMapEntry.getKey())).collect(Collectors.toList()); - Map map = new LinkedHashMap<>(); - map.put(nestedFieldMapEntry.getKey(), listOfStrings); - buildMapWithProcessorKeyAndOriginalValueForMapType( - nestedFieldMapEntry.getKey(), - nestedFieldMapEntry.getValue(), - map, - next - ); + for (Map.Entry embeddingFieldsEntry : fieldMap.entrySet()) { + Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey()); + if (sourceValue != null) { + String sourceKey = embeddingFieldsEntry.getKey(); + Class sourceValueClass = sourceValue.getClass(); + if (List.class.isAssignableFrom(sourceValueClass) || Map.class.isAssignableFrom(sourceValueClass)) { + validateNestedTypeValue(sourceKey, sourceValue, 1); + } else if (!String.class.isAssignableFrom(sourceValueClass)) { + throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it"); + } else if (StringUtils.isBlank(sourceValue.toString())) { + throw new IllegalArgumentException("field [" + sourceKey + "] has empty string value, cannot process it"); } } - treeRes.put(parentKey, next); - } else { - String key = String.valueOf(processorKey); - treeRes.put(key, sourceAndMetadataMap.get(parentKey)); } } - @SuppressWarnings({ "unchecked" }) 
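The validateEmbeddingFieldsValue method added above hands every map or list value to a depth-limited walk (validateNestedTypeValue, later in this diff), so a pathologically nested document fails fast instead of recursing without bound. A minimal standalone sketch of that guard, with an illustrative hard-coded limit where the processor actually reads INDEX_MAPPING_DEPTH_LIMIT_SETTING (class and constant names here are made up for the example):

import java.util.List;
import java.util.Map;
import java.util.Objects;

final class DepthLimitedWalk {
    private static final int MAX_DEPTH = 20; // stand-in for the index mapping depth limit setting

    static void validate(String key, Object value, int depth) {
        if (depth > MAX_DEPTH) {
            throw new IllegalArgumentException("map type field [" + key + "] reached max depth limit");
        }
        if (value instanceof Map) {
            // recurse into non-null map values, as the processor does
            ((Map<?, ?>) value).values().stream().filter(Objects::nonNull).forEach(v -> validate(key, v, depth + 1));
        } else if (value instanceof List) {
            for (Object element : (List<?>) value) {
                validate(key, element, depth + 1);
            }
        } else if (!(value instanceof String)) {
            throw new IllegalArgumentException("map type field [" + key + "] has non-string type");
        }
    }
}

This sketch folds the map and list cases into one method; the processor splits them into validateNestedTypeValue and validateListTypeValue and additionally rejects null and blank list entries.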
- private List createInferenceList(Map knnKeyMap) { - List texts = new ArrayList<>(); - knnKeyMap.entrySet().stream().filter(knnMapEntry -> knnMapEntry.getValue() != null).forEach(knnMapEntry -> { - Object sourceValue = knnMapEntry.getValue(); - if (sourceValue instanceof List) { - for (Object nestedValue : (List) sourceValue) { - if (nestedValue instanceof String) { - texts.add((String) nestedValue); - } else { - texts.addAll((List) nestedValue); - } - } - } else if (sourceValue instanceof Map) { - createInferenceListForMapTypeInput(sourceValue, texts); - } else { - texts.add(sourceValue.toString()); - } - }); - return texts; - } - - @SuppressWarnings("unchecked") - private void createInferenceListForMapTypeInput(Object sourceValue, List texts) { - if (sourceValue instanceof Map) { - ((Map) sourceValue).forEach((k, v) -> createInferenceListForMapTypeInput(v, texts)); - } else if (sourceValue instanceof List) { - texts.addAll(((List) sourceValue)); - } else { - if (sourceValue == null) return; - texts.add(sourceValue.toString()); + @SuppressWarnings({ "rawtypes", "unchecked" }) + private void validateNestedTypeValue(String sourceKey, Object sourceValue, int maxDepth) { + if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) { + throw new IllegalArgumentException("map type field [" + sourceKey + "] reached max depth limit, cannot process it"); + } else if ((List.class.isAssignableFrom(sourceValue.getClass()))) { + validateListTypeValue(sourceKey, sourceValue, maxDepth); + } else if (Map.class.isAssignableFrom(sourceValue.getClass())) { + ((Map) sourceValue).values() + .stream() + .filter(Objects::nonNull) + .forEach(x -> validateNestedTypeValue(sourceKey, x, maxDepth + 1)); + } else if (!String.class.isAssignableFrom(sourceValue.getClass())) { + throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it"); } } - private void setTargetFieldsToDocument(IngestDocument ingestDocument, Map processorMap, List results) { - Objects.requireNonNull(results, "embedding failed, inference returns null result!"); - log.debug("Model inference result fetched, starting build vector output!"); - Map result = buildResult(processorMap, results, ingestDocument.getSourceAndMetadata()); - result.forEach(ingestDocument::setFieldValue); - } - - @VisibleForTesting - Map buildResult(Map processorMap, List results, Map sourceAndMetadataMap) { - InferenceProcessor.IndexWrapper indexWrapper = new InferenceProcessor.IndexWrapper(0); - Map result = new LinkedHashMap<>(); - for (Map.Entry knnMapEntry : processorMap.entrySet()) { - String knnKey = knnMapEntry.getKey(); - Object sourceValue = knnMapEntry.getValue(); - if (sourceValue instanceof String) { - result.put(knnKey, results.get(indexWrapper.index++)); - } else if (sourceValue instanceof List) { - result.put(knnKey, buildResultForListType((List) sourceValue, results, indexWrapper)); - } else if (sourceValue instanceof Map) { - putResultToSourceMapForMapType(knnKey, sourceValue, results, indexWrapper, sourceAndMetadataMap); + @SuppressWarnings({ "rawtypes" }) + private void validateListTypeValue(String sourceKey, Object sourceValue, int maxDepth) { + for (Object value : (List) sourceValue) { + if (value instanceof Map) { + validateNestedTypeValue(sourceKey, value, maxDepth + 1); + } else if (value == null) { + throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it"); + } else if (!(value instanceof String)) { + throw new 
IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it"); + } else if (StringUtils.isBlank(value.toString())) { + throw new IllegalArgumentException("list type field [" + sourceKey + "] has empty string, cannot process it"); } } - return result; } - @SuppressWarnings({ "unchecked" }) - private void putResultToSourceMapForMapType( - String processorKey, - Object sourceValue, - List results, - InferenceProcessor.IndexWrapper indexWrapper, - Map sourceAndMetadataMap - ) { - if (processorKey == null || sourceAndMetadataMap == null || sourceValue == null) return; - if (sourceValue instanceof Map) { - for (Map.Entry inputNestedMapEntry : ((Map) sourceValue).entrySet()) { - if (sourceAndMetadataMap.get(processorKey) instanceof List) { - // build output for list of nested objects - for (Map nestedElement : (List>) sourceAndMetadataMap.get(processorKey)) { - nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++)); + @SuppressWarnings("unchecked") + private void chunkMapType(Map sourceAndMetadataMap, Map fieldMap) { + for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { + String originalKey = fieldMapEntry.getKey(); + Object targetKey = fieldMapEntry.getValue(); + if (targetKey instanceof Map) { + // call this method recursively when target key is a map + Object sourceObject = sourceAndMetadataMap.get(originalKey); + if (sourceObject instanceof List) { + List sourceObjectList = (List) sourceObject; + for (Object source : sourceObjectList) { + if (source instanceof Map) { + chunkMapType((Map) source, (Map) targetKey); + } } - } else { - putResultToSourceMapForMapType( - inputNestedMapEntry.getKey(), - inputNestedMapEntry.getValue(), - results, - indexWrapper, - (Map) sourceAndMetadataMap.get(processorKey) - ); + } else if (sourceObject instanceof Map) { + chunkMapType((Map) sourceObject, (Map) targetKey); + } + } else { + // chunk the object when target key is a string + Object chunkObject = sourceAndMetadataMap.get(originalKey); + List chunkedResult = chunkLeafType(chunkObject); + if (chunkedResult != null) { + sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); } } - } else if (sourceValue instanceof String) { - sourceAndMetadataMap.put(processorKey, results.get(indexWrapper.index++)); - } else if (sourceValue instanceof List) { - sourceAndMetadataMap.put(processorKey, buildResultForListType((List) sourceValue, results, indexWrapper)); } } public static class Factory implements Processor.Factory { - private final Settings settings; + private final Environment environment; private final ClusterService clusterService; @@ -329,8 +265,13 @@ public static class Factory implements Processor.Factory { private final AnalysisRegistry analysisRegistry; - public Factory(Settings settings, ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry) { - this.settings = settings; + public Factory( + Environment environment, + ClusterService clusterService, + IndicesService indicesService, + AnalysisRegistry analysisRegistry + ) { + this.environment = environment; this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; @@ -350,7 +291,7 @@ public DocumentChunkingProcessor create( description, fieldMap, algorithmMap, - settings, + environment, clusterService, indicesService, analysisRegistry From eedd58d927eda113362a4cda511ea74aae3f6253 Mon Sep 17 00:00:00 2001 From: Lu Date: Thu, 7 Mar 2024 13:01:59 +0800 Subject: [PATCH 
061/189] add default delimiter value Signed-off-by: Lu Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 4 +--- .../processor/chunker/DelimiterChunkerTests.java | 16 ++++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 4062c560e..1d765893c 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -25,8 +25,6 @@ public void validateParameters(Map parameters) { } else if (((String) delimiter).isEmpty()) { throw new IllegalArgumentException("delimiter parameters should not be empty."); } - } else { - throw new IllegalArgumentException("You must contain field: " + DELIMITER_FIELD + " in your parameter."); } if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD); @@ -40,7 +38,7 @@ public void validateParameters(Map parameters) { @Override public List chunk(String content, Map parameters) { - String delimiter = (String) parameters.get(DELIMITER_FIELD); + String delimiter = (String) parameters.getOrDefault(DELIMITER_FIELD, "."); int maxChunkingNumber = (int) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, -1); List chunkResult = new ArrayList<>(); int start = 0; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 61fea30c2..b92999fa3 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -17,14 +17,6 @@ public class DelimiterChunkerTests extends OpenSearchTestCase { - public void testChunkerWithNoDelimiterField() { - DelimiterChunker chunker = new DelimiterChunker(); - String content = "a\nb\nc\nd"; - Map inputParameters = Map.of("", ""); - Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("You must contain field: " + DELIMITER_FIELD + " in your parameter.", exception.getMessage()); - } - public void testChunkerWithWrongLimitFieldList() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -81,6 +73,14 @@ public void testChunker() { assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } + public void testChunkerWithDefaultDelimiter() { + DelimiterChunker chunker = new DelimiterChunker(); + String content = "a.b.c.d"; + Map inputParameters = Map.of(); + List chunkResult = chunker.chunk(content, inputParameters); + assertEquals(List.of("a.", "b.", "c.", "d"), chunkResult); + } + public void testChunkerWithDelimiterEnd() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd\n"; From b9bf3ef7978c817e88970bb853d0fb7ea87109ed Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 13:07:30 +0800 Subject: [PATCH 062/189] implement max chunk logic in document chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 21 ++++++++++-- .../processor/chunker/ChunkerFactory.java | 2 +- .../processor/chunker/DelimiterChunker.java | 2 +- .../{IFieldChunker.java => FieldChunker.java} | 2 +- .../chunker/FixedTokenLengthChunker.java | 
34 ++++++++++--------- .../chunker/ChunkerFactoryTests.java | 4 +-- .../chunker/FixedTokenLengthChunkerTests.java | 2 +- 7 files changed, 43 insertions(+), 24 deletions(-) rename src/main/java/org/opensearch/neuralsearch/processor/chunker/{IFieldChunker.java => FieldChunker.java} (90%) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 957652638..dc7ded196 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -24,7 +24,7 @@ import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; -import org.opensearch.neuralsearch.processor.chunker.IFieldChunker; +import org.opensearch.neuralsearch.processor.chunker.FieldChunker; import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; @@ -41,6 +41,11 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { public static final String ALGORITHM_FIELD = "algorithm"; + public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + + private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; + + private int current_max_chunk_limit = DEFAULT_MAX_CHUNK_LIMIT; private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); private String chunkerType; @@ -107,8 +112,14 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { "Unable to create the processor as [" + ALGORITHM_FIELD + "] cannot be cast to [" + Map.class.getName() + "]" ); } + FieldChunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); this.chunkerType = algorithmKey; this.chunkerParameters = (Map) algorithmValue; + chunker.validateParameters(chunkerParameters); + if (((Map) algorithmValue).containsKey(MAX_CHUNK_LIMIT_FIELD)) { + this.current_max_chunk_limit = ((Number) ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD)).intValue(); + } } } @@ -128,7 +139,13 @@ private boolean isListString(Object value) { } private List chunkString(String content) { // assume that content is either a map, list or string - IFieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); + if (current_max_chunk_limit <= 0) { + throw new IllegalStateException("Exceed [" + MAX_CHUNK_LIMIT_FIELD + "] in [" + chunkerType+ "] algorithm"); + } + FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); + List result = chunker.chunk(content, chunkerParameters); + current_max_chunk_limit -= result.size(); + chunkerParameters.put(MAX_CHUNK_LIMIT_FIELD, current_max_chunk_limit); return chunker.chunk(content, chunkerParameters); }
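The budget arithmetic above is easiest to check with small numbers: with max_chunk_limit set to 5, a first field that yields 3 chunks leaves a budget of 2, which is written back into the chunker parameters for the next call; once the budget is exhausted, the next chunkString call throws. A rough standalone model of that bookkeeping as first implemented here (illustrative only, not the plugin's API):

public class MaxChunkBudgetDemo {
    public static void main(String[] args) {
        int budget = 5; // configured max_chunk_limit
        int[] chunksPerField = { 3, 2, 1 };
        for (int produced : chunksPerField) {
            if (budget <= 0) { // same pre-chunking check as chunkString above
                throw new IllegalStateException("Exceed [max_chunk_limit]");
            }
            budget -= produced; // 5 -> 2 -> 0; the third field trips the check
        }
    }
}

Note that the check runs before chunking, so a single call may still overshoot the remaining budget; a later commit in this series replaces the countdown with an accumulated chunk count.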
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index f1eb3b68b..bf8b52882 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -13,7 +13,7 @@ public class ChunkerFactory { public static final String FIXED_LENGTH_ALGORITHM = "fix_length"; public static final String DELIMITER_ALGORITHM = "delimiter"; - public static IFieldChunker create(String type, AnalysisRegistry analysisRegistry) { + public static FieldChunker create(String type, AnalysisRegistry analysisRegistry) { switch (type) { case FIXED_LENGTH_ALGORITHM: return new FixedTokenLengthChunker(analysisRegistry); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 1d765893c..e35a811c8 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -8,7 +8,7 @@ import java.util.List; import java.util.Map; -public class DelimiterChunker implements IFieldChunker { +public class DelimiterChunker implements FieldChunker { public DelimiterChunker() {} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/IFieldChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java similarity index 90% rename from src/main/java/org/opensearch/neuralsearch/processor/chunker/IFieldChunker.java rename to src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java index 6f031bf53..194e35b97 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/IFieldChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java @@ -7,7 +7,7 @@ import java.util.Map; import java.util.List; -public interface IFieldChunker { +public interface FieldChunker { void validateParameters(Map parameters); List chunk(String content, Map parameters); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 082e00620..2f810d64d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -11,13 +11,11 @@ import java.util.List; import java.util.Map; import lombok.extern.log4j.Log4j2; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; - import org.opensearch.index.analysis.AnalysisRegistry; - import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; @Log4j2 -public class FixedTokenLengthChunker implements IFieldChunker { +public class FixedTokenLengthChunker implements FieldChunker { public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; @@ -29,7 +27,7 @@ public class FixedTokenLengthChunker implements IFieldChunker { private static final int DEFAULT_TOKEN_LIMIT = 500; private static final double DEFAULT_OVERLAP_RATE = 0.2; private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; - private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; + private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; private static final String DEFAULT_TOKENIZER = "standard"; @@ -50,15 +48,14 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun tokenList.add(analyzeToken.getTerm()); } return tokenList; } catch (IOException e) { - throw new RuntimeException(e); + throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); } }; @Override public List chunk(String content, Map parameters) { - // assume that parameters has been validated + // prior to chunking, parameters have been validated int tokenLimit = DEFAULT_TOKEN_LIMIT; double overlapRate = DEFAULT_OVERLAP_RATE; int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; @@ -107,15 +104,14 @@ public List chunk(String content, Map parameters) { } private void addPassageToList(List passages, String passage, int maxChunkLimit) { - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && passages.size() >= maxChunkLimit) { - throw new IllegalStateException("Exceed max chunk number: " + maxChunkLimit); + if (passages.size() >= maxChunkLimit) { + throw new IllegalStateException("Exceed max chunk number in fixed token length algorithm"); } passages.add(passage); } - private void validatePositiveIntegerParameter(Map parameters, String fieldName) { + private void validatePositiveIntegerParameter(Map parameters, String fieldName, boolean requirePositive) { // this method validate that parameter is a positive integer - // this method accepts positive float or double number if (!parameters.containsKey(fieldName)) { // all parameters are optional return; @@ -125,16 +121,22 @@ private void validatePositiveIntegerParameter(Map parameters, St "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (((Number) parameters.get(fieldName)).intValue() <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + if (requirePositive) { + if (((Number) parameters.get(fieldName)).intValue() <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + } + } else { + if (((Number) parameters.get(fieldName)).intValue() < 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] cannot be negative"); + } } } @Override public void validateParameters(Map parameters) { - validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD); - validatePositiveIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD); - validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD); + validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, true); + validatePositiveIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD, false); + validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD, true); if (parameters.containsKey(OVERLAP_RATE_FIELD)) { if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) {
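The chunk loop in this file slides a fixed window over the token list: the overlap is floor(token_limit * overlap_rate), and each passage starts token_limit - overlap tokens after the previous one. With token_limit = 10 and overlap_rate = 0.5 on the 24-token example document, that yields the windows [0,10), [5,15), [10,20) and [15,24), which is what the overlap tests later in this series assert. A small standalone rendering of that arithmetic (illustrative, not the chunker itself):

public class OverlapWindowDemo {
    public static void main(String[] args) {
        int tokenCount = 24;      // tokens in the example document
        int tokenLimit = 10;      // token_limit parameter
        double overlapRate = 0.5; // overlap_rate parameter (still a double at this point in the series)
        int overlap = (int) Math.floor(tokenLimit * overlapRate); // 5 shared tokens
        int step = tokenLimit - overlap;                          // window advances 5 tokens
        for (int start = 0; ; start += step) {
            int end = Math.min(start + tokenLimit, tokenCount);
            System.out.println("[" + start + ", " + end + ")"); // [0,10) [5,15) [10,20) [15,24)
            if (start + tokenLimit >= tokenCount) break;         // last window covers the tail
        }
    }
}

A later commit replaces the double multiplication with BigDecimal so the overlap count is exact rather than subject to floating-point rounding.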
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 0f6a95d40..38557c1d4 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -21,13 +21,13 @@ public void testGetAllChunkers() { } public void testCreate_FixedTokenLength() { - IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, analysisRegistry); + FieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, analysisRegistry); assertNotNull(chunker); assertTrue(chunker instanceof FixedTokenLengthChunker); } public void testCreate_Delimiter() { - IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry); + FieldChunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry); assertNotNull(chunker); assertTrue(chunker instanceof DelimiterChunker); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index d24d9f423..47bc4990e 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -30,7 +30,7 @@ public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { - private FixedTokenLengthChunker FixedTokenLengthChunker; + private org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker FixedTokenLengthChunker; @Before @SneakyThrows From 2ac2f60f4d2b3d78a036005c73ae21f26cbc3ad0 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 13:09:51 +0800 Subject: [PATCH 063/189] add initial value for max chunk limit in document chunking processor Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index dc7ded196..ac3a9a45d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -174,6 +174,7 @@ private List chunkLeafType(Object value) { public IngestDocument execute(IngestDocument ingestDocument) { validateEmbeddingFieldsValue(ingestDocument); + chunkerParameters.put(MAX_CHUNK_LIMIT_FIELD, current_max_chunk_limit); if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); From 6067044894a5b0664d7b49a3c19710fa3d55a11a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 13:10:37 +0800 Subject: [PATCH 064/189] bug fix in chunking processor: allow 0 max_chunk_limit Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index ac3a9a45d..ed269af20 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -139,7 +139,7 @@ private boolean isListString(Object value) { private List chunkString(String content) { // assume that content is either a map, list or string - if (current_max_chunk_limit <= 0) { + if (current_max_chunk_limit < 0) { throw new IllegalStateException("Exceed [" + MAX_CHUNK_LIMIT_FIELD + "] in [" + chunkerType+ "] algorithm"); } FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); From 98d1ab3d7ee4380bea71f869c4dfb206a733b980 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 13:37:35 +0800 Subject: [PATCH 065/189] implement overlap rate with big decimal Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 1 - .../processor/chunker/FixedTokenLengthChunker.java | 13 ++++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index ed269af20..292388f0c 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -315,6 +315,5 @@ public DocumentChunkingProcessor create( analysisRegistry ); } - } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 2f810d64d..98d9402f2 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -5,9 +5,11 @@ package org.opensearch.neuralsearch.processor.chunker; import java.io.IOException; +import java.math.RoundingMode; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.math.BigDecimal; import lombok.extern.log4j.Log4j2; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; @@ -57,7 +59,7 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun public List chunk(String content, Map parameters) { // prior to chunking, parameters have been validated int tokenLimit = DEFAULT_TOKEN_LIMIT; - double overlapRate = DEFAULT_OVERLAP_RATE; + BigDecimal overlap_rate = new BigDecimal(String.valueOf(DEFAULT_OVERLAP_RATE)); int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; @@ -67,7 +69,7 @@ public List chunk(String content, Map parameters) { tokenLimit = ((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue(); } if (parameters.containsKey(OVERLAP_RATE_FIELD)) { - overlapRate = ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue(); + overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); } if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) { maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT_FIELD)).intValue(); @@ -84,7 +86,8 @@ public List chunk(String content, Map parameters) { String passage; int startToken = 0; - int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); + BigDecimal overlapTokenNumberBigDecimal = overlap_rate.multiply(new BigDecimal(String.valueOf(tokenLimit))).setScale(0, RoundingMode.DOWN); + int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue();; // overlapTokenNumber must be smaller than the token limit overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1); @@ -144,8 +147,8 @@ public void validateParameters(Map parameters) { "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() < 0.0 - || ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() >= 1.0) { + BigDecimal overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); + if (overlap_rate.compareTo(BigDecimal.ZERO) < 0 || overlap_rate.compareTo(BigDecimal.ONE) >= 0) { throw new IllegalArgumentException( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 1, 1 is not included." 
); From 79a637c0d20cecf9594c3ca5ccb743df65a8adea Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 13:40:07 +0800 Subject: [PATCH 066/189] update max chunk limit in delimiter Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/DelimiterChunker.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index e35a811c8..e99849b53 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -16,6 +16,8 @@ public DelimiterChunker() {} public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; + @Override public void validateParameters(Map parameters) { if (parameters.containsKey(DELIMITER_FIELD)) { @@ -30,8 +32,8 @@ public void validateParameters(Map parameters) { Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD); if (!(maxChunkLimit instanceof Integer)) { throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit.toString() + " should be integer."); - } else if ((int) maxChunkLimit <= 0) { - throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " is not greater than 0."); + } else if ((int) maxChunkLimit < 0) { + throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " is negative."); } } } From 6da6395645c81bf4a4cc22620cf17c08692816f2 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 16:26:45 +0800 Subject: [PATCH 067/189] update parameter setting for fixed token length algorithm Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 56 ++++++------------- .../chunker/FixedTokenLengthChunkerTests.java | 51 ----------------- 2 files changed, 18 insertions(+), 89 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 98d9402f2..5d0bcdef9 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -10,6 +10,7 @@ import java.util.List; import java.util.Map; import java.math.BigDecimal; +import java.util.stream.Collectors; import lombok.extern.log4j.Log4j2; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; @@ -22,16 +23,16 @@ public class FixedTokenLengthChunker implements FieldChunker { public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; - public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; public static final String TOKENIZER_FIELD = "tokenizer"; // default values for each parameter private static final int DEFAULT_TOKEN_LIMIT = 500; - private static final double DEFAULT_OVERLAP_RATE = 0.2; + private static final BigDecimal DEFAULT_OVERLAP_RATE = new BigDecimal("0"); private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; - private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; private static final String DEFAULT_TOKENIZER = "standard"; + private static final BigDecimal OVERLAP_RATE_UPPER_BOUND = new BigDecimal("0.5"); + private 
final AnalysisRegistry analysisRegistry; public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { @@ -44,12 +45,7 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun analyzeRequest.tokenizer(tokenizer); try { AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); - List analyzeTokenList = analyzeResponse.getTokens(); - List tokenList = new ArrayList<>(); - for (AnalyzeAction.AnalyzeToken analyzeToken : analyzeTokenList) { - tokenList.add(analyzeToken.getTerm()); - } - return tokenList; + return analyzeResponse.getTokens().stream().map(AnalyzeAction.AnalyzeToken::getTerm).collect(Collectors.toList()); } catch (IOException e) { throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); } @@ -61,7 +57,6 @@ public List chunk(String content, Map parameters) { int tokenLimit = DEFAULT_TOKEN_LIMIT; BigDecimal overlap_rate = new BigDecimal(String.valueOf(DEFAULT_OVERLAP_RATE)); int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; - int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; String tokenizer = DEFAULT_TOKENIZER; @@ -77,17 +72,16 @@ public List chunk(String content, Map parameters) { if (parameters.containsKey(TOKENIZER_FIELD)) { tokenizer = (String) parameters.get(TOKENIZER_FIELD); } - if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { - maxChunkLimit = ((Number) parameters.get(MAX_CHUNK_LIMIT_FIELD)).intValue(); - } List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); String passage; int startToken = 0; - BigDecimal overlapTokenNumberBigDecimal = overlap_rate.multiply(new BigDecimal(String.valueOf(tokenLimit))).setScale(0, RoundingMode.DOWN); - int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue();; + BigDecimal overlapTokenNumberBigDecimal = overlap_rate.multiply(new BigDecimal(String.valueOf(tokenLimit))) + .setScale(0, RoundingMode.DOWN); + int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue(); + ; // overlapTokenNumber must be smaller than the token limit overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1); @@ -95,25 +89,18 @@ public List chunk(String content, Map parameters) { if (startToken + tokenLimit >= tokens.size()) { // break the loop when already cover the last token passage = String.join(" ", tokens.subList(startToken, tokens.size())); - addPassageToList(passages, passage, maxChunkLimit); + passages.add(passage); break; } else { passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); - addPassageToList(passages, passage, maxChunkLimit); + passages.add(passage); } startToken += tokenLimit - overlapTokenNumber; } return passages; } - private void addPassageToList(List passages, String passage, int maxChunkLimit) { - if (passages.size() >= maxChunkLimit) { - throw new IllegalStateException("Exceed max chunk number in fixed token length algorithm"); - } - passages.add(passage); - } - - private void validatePositiveIntegerParameter(Map parameters, String fieldName, boolean requirePositive) { + private void validatePositiveIntegerParameter(Map parameters, String fieldName) { // this method validate that parameter is a positive integer if (!parameters.containsKey(fieldName)) { // all parameters are optional @@ -124,22 +111,15 @@ private void validatePositiveIntegerParameter(Map parameters, St "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (requirePositive) { - if (((Number) parameters.get(fieldName)).intValue() <= 0) { - throw 
new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); - } - } else { - if (((Number) parameters.get(fieldName)).intValue() < 0) { - throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] cannot be negative"); - } + if (((Number) parameters.get(fieldName)).intValue() <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); } } @Override public void validateParameters(Map parameters) { - validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, true); - validatePositiveIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD, false); - validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD, true); + validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD); + validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD); if (parameters.containsKey(OVERLAP_RATE_FIELD)) { if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { @@ -148,9 +128,9 @@ public void validateParameters(Map parameters) { ); } BigDecimal overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); - if (overlap_rate.compareTo(BigDecimal.ZERO) < 0 || overlap_rate.compareTo(BigDecimal.ONE) >= 0) { + if (overlap_rate.compareTo(BigDecimal.ZERO) < 0 || overlap_rate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 1, 1 is not included." + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND ); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 47bc4990e..2f86d448e 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -26,7 +26,6 @@ import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.MAX_CHUNK_LIMIT_FIELD; public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { @@ -122,29 +121,6 @@ public void testValidateParameters_whenIllegalTokenizerType_thenFail() { ); } - public void testValidateParameters_whenIllegalChunkLimitType_thenFail() { - Map parameters = new HashMap<>(); - parameters.put(MAX_CHUNK_LIMIT_FIELD, "invalid chunk limit"); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> FixedTokenLengthChunker.validateParameters(parameters) - ); - assertEquals( - "fixed length parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", - illegalArgumentException.getMessage() - ); - } - - public void testValidateParameters_whenIllegalChunkLimitValue_thenFail() { - Map parameters = new HashMap<>(); - parameters.put(MAX_CHUNK_LIMIT_FIELD, -1); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> FixedTokenLengthChunker.validateParameters(parameters) - ); - assertEquals("fixed length parameter [" + 
MAX_CHUNK_LIMIT_FIELD + "] must be positive", illegalArgumentException.getMessage()); - } - public void testChunk_withTokenLimit_10() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); @@ -186,31 +162,4 @@ public void testChunk_withOverlapRate_half() { expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } - - public void testChunk_withMaxChunkLimitOne_thenFail() { - Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT_FIELD, 10); - parameters.put(MAX_CHUNK_LIMIT_FIELD, 1); - String content = - "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; - IllegalStateException illegalStateException = assertThrows( - IllegalStateException.class, - () -> FixedTokenLengthChunker.chunk(content, parameters) - ); - assertEquals("Exceed max chunk number: 1", illegalStateException.getMessage()); - } - - public void testChunk_withMaxChunkLimitTen_thenSuccess() { - Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT_FIELD, 10); - parameters.put(MAX_CHUNK_LIMIT_FIELD, 10); - String content = - "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; - List passages = FixedTokenLengthChunker.chunk(content, parameters); - List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); - assertEquals(expectedPassages, passages); - } } From 105d4a0fcfe1e8a025008253801b862a5d107812 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 16:31:47 +0800 Subject: [PATCH 068/189] update max chunk limit implementation in chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 292388f0c..f3bbaf9d3 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -43,9 +43,11 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; - private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; + private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; - private int current_max_chunk_limit = DEFAULT_MAX_CHUNK_LIMIT; + private int current_chunk_count = 0; + + private int max_chunk_limit = DEFAULT_MAX_CHUNK_LIMIT; private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); private String chunkerType; @@ -118,7 +120,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { this.chunkerParameters = (Map) algorithmValue; chunker.validateParameters(chunkerParameters); if (((Map) algorithmValue).containsKey(MAX_CHUNK_LIMIT_FIELD)) { - this.current_max_chunk_limit = ((Number) ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD)).intValue(); + this.max_chunk_limit = ((Number) ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD)).intValue(); } } } @@ -138,14 +140,16 @@ private boolean 
isListString(Object value) { } private List chunkString(String content) { - // assume that content is either a map, list or string - if (current_max_chunk_limit < 0) { - throw new IllegalStateException("Exceed [" + MAX_CHUNK_LIMIT_FIELD + "] in [" + chunkerType+ "] algorithm"); - } FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); List result = chunker.chunk(content, chunkerParameters); - current_max_chunk_limit -= result.size(); - chunkerParameters.put(MAX_CHUNK_LIMIT_FIELD, current_max_chunk_limit); + current_chunk_count += result.size(); + if (max_chunk_limit != DEFAULT_MAX_CHUNK_LIMIT && current_chunk_count > max_chunk_limit) { + throw new IllegalArgumentException( + "Unable to create the processor as the number of chunks [" + current_chunk_count + "] exceeds the maximum chunk limit [" + + MAX_CHUNK_LIMIT_FIELD + + "]" + ); + } return chunker.chunk(content, chunkerParameters); } @@ -174,7 +178,6 @@ private List chunkLeafType(Object value) { public IngestDocument execute(IngestDocument ingestDocument) { validateEmbeddingFieldsValue(ingestDocument); - chunkerParameters.put(MAX_CHUNK_LIMIT_FIELD, current_max_chunk_limit); if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); From cd4eda732106f534902e1456c7c8ecfadaeb1b22 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 16:38:34 +0800 Subject: [PATCH 069/189] fix unit tests for fixed token length algorithm Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunkerTests.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 2f86d448e..0633213fb 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -97,13 +97,13 @@ public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { Map parameters = new HashMap<>(); - parameters.put(OVERLAP_RATE_FIELD, 1.0); + parameters.put(OVERLAP_RATE_FIELD, 0.6); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 1, 1 is not included.", + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 0.5", illegalArgumentException.getMessage() ); } @@ -129,8 +129,8 @@ public void testChunk_withTokenLimit_10() { List passages = FixedTokenLengthChunker.chunk(content, parameters); List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -144,7 +144,7 @@ public void testChunk_withTokenLimit_20() { expectedPassages.add( "This is an example document 
to be chunked The document contains a single paragraph two sentences and 24 tokens by" ); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } From ceaa7d29013b9460750fabd7332be2e8daacc9e9 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 16:41:24 +0800 Subject: [PATCH 070/189] spotless apply for document chunking processor Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index f3bbaf9d3..5b8269586 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -145,7 +145,9 @@ private List chunkString(String content) { current_chunk_count += result.size(); if (max_chunk_limit != DEFAULT_MAX_CHUNK_LIMIT && current_chunk_count > max_chunk_limit) { throw new IllegalArgumentException( - "Unable to create the processor as the number of chunks [" + current_chunk_count + "] exceeds the maximum chunk limit [" + "Unable to create the processor as the number of chunks [" + + current_chunk_count + + "] exceeds the maximum chunk limit [" + MAX_CHUNK_LIMIT_FIELD + "]" ); From 715c14582b60919e97d41e97b234b2edbef964f7 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 16:51:25 +0800 Subject: [PATCH 071/189] initialize current chunk count Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 5b8269586..fa43238e2 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -179,7 +179,7 @@ private List chunkLeafType(Object value) { @Override public IngestDocument execute(IngestDocument ingestDocument) { validateEmbeddingFieldsValue(ingestDocument); - + current_chunk_count = 0; if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); From 75663e183c9c3447986eed8ee76e96aef1712c0a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 17:50:50 +0800 Subject: [PATCH 072/189] parameter validation for max chunk limit Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 6 +++++- .../processor/DocumentChunkingProcessorTests.java | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index fa43238e2..24c95e3d0 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -120,7 +120,11 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { this.chunkerParameters = (Map) algorithmValue; 
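The hunk continues below with a guard that rejects a non-positive max_chunk_limit when the pipeline is created, rather than failing later at ingest time. As a standalone illustration of that optional-parameter check (class and method names here are made up for the example):

import java.util.Map;

final class LimitGuard {
    static int readMaxChunkLimit(Map<String, Object> parameters, int defaultLimit) {
        Object raw = parameters.get("max_chunk_limit");
        if (raw == null) {
            return defaultLimit; // the parameter stays optional
        }
        int limit = ((Number) raw).intValue(); // the processor casts the same way
        if (limit <= 0) {
            throw new IllegalArgumentException("Parameter [max_chunk_limit] must be a positive integer");
        }
        return limit;
    }
}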
chunker.validateParameters(chunkerParameters); if (((Map) algorithmValue).containsKey(MAX_CHUNK_LIMIT_FIELD)) { - this.max_chunk_limit = ((Number) ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD)).intValue(); + int max_chunk_limit = ((Number) ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD)).intValue(); + if (max_chunk_limit <= 0) { + throw new IllegalArgumentException("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer"); + } + this.max_chunk_limit = max_chunk_limit; } } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index f515bc6e4..332819ae7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -74,7 +74,6 @@ public Map> getTokeniz @Before public void setup() { - Settings settings = Settings.builder().build(); Metadata metadata = mock(Metadata.class); ClusterState clusterState = mock(ClusterState.class); ClusterService clusterService = mock(ClusterService.class); @@ -82,7 +81,7 @@ public void setup() { when(metadata.index(anyString())).thenReturn(null); when(clusterState.metadata()).thenReturn(metadata); when(clusterService.state()).thenReturn(clusterState); - factory = new DocumentChunkingProcessor.Factory(settings, clusterService, indicesService, getAnalysisRegistry()); + factory = new DocumentChunkingProcessor.Factory(environment, clusterService, indicesService, getAnalysisRegistry()); } private Map createFixedTokenLengthParameters() { From 2e5dc00ea9874599208da0b311446f4ba7c3437b Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 7 Mar 2024 18:14:29 +0800 Subject: [PATCH 073/189] fix integration tests Signed-off-by: yuye-aws --- .../DocumentChunkingProcessorIT.java | 6 +++--- .../chunker/PipelineForCascadedChunker.json | 20 +++++++++---------- .../chunker/PipelineForDelimiterChunker.json | 10 +++++----- .../PipelineForFixedTokenLengthChunker.json | 10 +++++----- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index d8caa64da..d136af9c4 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -63,8 +63,8 @@ public void testDocumentChunkingProcessor_withFixedTokenLength_successful() thro List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null); @@ -112,7 +112,7 @@ public void testDocumentChunkingProcessor_withCascade_successful() throws Except // " ", "." 
and "," will not be included in fixed token length output expectedPassages.add("This is an example document to be chunked"); expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("tokens by standard tokenizer in OpenSearch"); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); expectedPassages.clear(); diff --git a/src/test/resources/processor/chunker/PipelineForCascadedChunker.json b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json index 6302cfdfe..3125d3d53 100644 --- a/src/test/resources/processor/chunker/PipelineForCascadedChunker.json +++ b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json @@ -4,11 +4,11 @@ { "chunking": { "field_map": { - "body": { - "delimiter": { - "delimiter": "." - }, - "output_field": "body_chunk_intermediate" + "body": "body_chunk_intermediate" + }, + "algorithm": { + "delimiter": { + "delimiter": "." } } } @@ -16,11 +16,11 @@ { "chunking": { "field_map": { - "body_chunk_intermediate": { - "fix_length": { - "token_limit": 10 - }, - "output_field": "body_chunk" + "body_chunk_intermediate": "body_chunk" + }, + "algorithm": { + "fix_length": { + "token_limit": 10 } } } diff --git a/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json b/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json index 9ababd6ed..dfa504065 100644 --- a/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json +++ b/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json @@ -4,11 +4,11 @@ { "chunking": { "field_map": { - "body": { - "delimiter": { - "delimiter": "." - }, - "output_field": "body_chunk" + "body": "body_chunk" + }, + "algorithm": { + "delimiter": { + "delimiter": "." 
} } } diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json index 27daf19c8..c2a55e4f2 100644 --- a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json @@ -4,11 +4,11 @@ { "chunking": { "field_map": { - "body": { - "fix_length": { - "token_limit": 10 - }, - "output_field": "body_chunk" + "body": "body_chunk" + }, + "algorithm": { + "fix_length": { + "token_limit": 10 } } } From d711390e0d3e6220dec3fac2130388ccc2603348 Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 18:45:49 +0800 Subject: [PATCH 074/189] fix current UT Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../DocumentChunkingProcessorTests.java | 188 +++++++----------- 1 file changed, 74 insertions(+), 114 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 332819ae7..4b15c25f7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -4,11 +4,11 @@ */ package org.opensearch.neuralsearch.processor; -import com.google.common.collect.ImmutableMap; import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; import org.mockito.Mock; +import org.opensearch.OpenSearchParseException; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.metadata.Metadata; import org.opensearch.cluster.service.ClusterService; @@ -38,6 +38,7 @@ import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.when; import static org.mockito.Mockito.mock; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; public class DocumentChunkingProcessorTests extends OpenSearchTestCase { @@ -46,6 +47,8 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase { private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; private static final String INPUT_FIELD = "body"; + + private static final String INPUT_NESTED_FIELD_KEY = "nested"; private static final String OUTPUT_FIELD = "body_chunk"; private static final String INDEX_NAME = "_index"; @@ -61,11 +64,11 @@ private AnalysisRegistry getAnalysisRegistry() { @Override public Map> getTokenizers() { return singletonMap( - "keyword", - (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( - name, - () -> new MockTokenizer(MockTokenizer.KEYWORD, false) - ) + "keyword", + (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( + name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false) + ) ); } }; @@ -75,6 +78,9 @@ public Map> getTokeniz @Before public void setup() { Metadata metadata = mock(Metadata.class); + Environment environment = mock(Environment.class); + Settings settings = Settings.builder().put("index.mapping.depth.limit", 20).build(); + when(environment.settings()).thenReturn(settings); ClusterState clusterState = mock(ClusterState.class); ClusterService clusterService = mock(ClusterService.class); IndicesService indicesService = mock(IndicesService.class); @@ -96,15 +102,25 @@ private Map createDelimiterParameters() { return 
parameters; } + private Map createStringFieldMap() { + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + return fieldMap; + } + + private Map createNestedFieldMap() { + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_NESTED_FIELD_KEY, Map.of(INPUT_FIELD, OUTPUT_FIELD)); + return fieldMap; + } + @SneakyThrows - private DocumentChunkingProcessor createFixedTokenLengthInstance() { + private DocumentChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @@ -117,7 +133,7 @@ private DocumentChunkingProcessor createDelimiterInstance() { algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @@ -127,11 +143,11 @@ public void testCreate_whenFieldMapEmpty_failure() { Map emptyFieldMap = new HashMap<>(); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, emptyFieldMap); Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException openSearchParseException = assertThrows( + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("Unable to create the processor as field_map is null or empty", illegalArgumentException.getMessage()); + assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } public void testCreate_whenFieldMapWithEmptyParameter_failure() { @@ -140,11 +156,11 @@ public void testCreate_whenFieldMapWithEmptyParameter_failure() { fieldMap.put("key", null); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException openSearchParseException = assertThrows( + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("parameters for input field [key] is null, cannot process it.", illegalArgumentException.getMessage()); + assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } public void testCreate_whenFieldMapWithIllegalParameterType_failure() { @@ -153,28 +169,11 @@ public void testCreate_whenFieldMapWithIllegalParameterType_failure() { fieldMap.put("key", "value"); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - 
() -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) - ); - assertEquals("parameters for input field [key] cannot be cast to [java.util.Map]", illegalArgumentException.getMessage()); - } - - public void testCreate_whenFieldMapWithIllegalKey_failure() { - Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); - fieldMap.put(null, 1); - fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - Map algorithmMap = new HashMap<>(); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); - Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException openSearchParseException = assertThrows( + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("found parameter entry with non-string key", illegalArgumentException.getMessage()); + assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } public void testCreate_whenFieldMapWithNoAlgorithm_failure() { @@ -183,21 +182,21 @@ public void testCreate_whenFieldMapWithNoAlgorithm_failure() { Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, algorithmMap); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "input field [" + INPUT_FIELD + "] should has and only has 1 chunking algorithm", - illegalArgumentException.getMessage() + "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + illegalArgumentException.getMessage() ); } @SneakyThrows public void testGetType() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); String type = processor.getType(); assertEquals(DocumentChunkingProcessor.TYPE, type); } @@ -209,10 +208,10 @@ private String createSourceDataString() { private List createSourceDataList() { List documents = new ArrayList<>(); documents.add( - "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.add( - "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); return documents; } @@ -220,27 +219,29 @@ private List createSourceDataList() { private Map createSourceDataMap() { Map documents = new HashMap<>(); documents.put( - "third", - "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "third", + "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.put( - "fourth", - "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "fourth", + "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); return documents; } private Map createSourceDataNestedMap() { - String documentString = createSourceDataString(); - List documentList = createSourceDataList(); - Map documentMap = createSourceDataMap(); Map documents = new HashMap<>(); - documents.put("String", documentString); - documents.put("List", documentList); - documents.put("Map", documentMap); + documents.put(INPUT_FIELD, createSourceDataString()); return documents; } + private IngestDocument createIngestDocumentWithNestedSourceData(Object sourceData) { + Map sourceAndMetadata = new HashMap<>(); + sourceAndMetadata.put(INPUT_NESTED_FIELD_KEY, sourceData); + sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME); + return new IngestDocument(sourceAndMetadata, new HashMap<>()); + } + private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { Map sourceAndMetadata = new HashMap<>(); sourceAndMetadata.put(INPUT_FIELD, sourceData); @@ -250,7 +251,7 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataString_successful() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -265,7 +266,7 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() { @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataList_successful() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataList()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -283,63 +284,22 @@ public void testExecute_withFixedTokenLength_andSourceDataList_successful() { } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataMap_successful() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); - IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataMap()); + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + IngestDocument ingestDocument = 
createIngestDocumentWithNestedSourceData(createSourceDataNestedMap()); IngestDocument document = processor.execute(ingestDocument); - assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); - Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); - assert (passages instanceof Map); - - List expectedPassages1 = new ArrayList<>(); - List expectedPassages2 = new ArrayList<>(); - - expectedPassages1.add("This is the third document to be chunked The document"); - expectedPassages1.add("The document contains a single paragraph two sentences and 24"); - expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch"); - expectedPassages2.add("This is the fourth document to be chunked The document"); - expectedPassages2.add("The document contains a single paragraph two sentences and 24"); - expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + assert (nestedResult instanceof Map); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD); + Object passages = ((Map) nestedResult).get(OUTPUT_FIELD); + assert (passages instanceof List); - Map expectedPassages = ImmutableMap.of("third", expectedPassages1, "fourth", expectedPassages2); - - assertEquals(expectedPassages, passages); - } + List expectedPassages = new ArrayList<>(); - @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataNestedMap_successful() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); - IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataNestedMap()); - IngestDocument document = processor.execute(ingestDocument); - assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); - Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); - assert (passages instanceof Map); - - Map expectedPassages = new HashMap<>(); - List expectedPassages1 = new ArrayList<>(); - List expectedPassages2 = new ArrayList<>(); - List expectedPassages3 = new ArrayList<>(); - List expectedPassages4 = new ArrayList<>(); - - expectedPassages1.add("This is an example document to be chunked The document"); - expectedPassages1.add("The document contains a single paragraph two sentences and 24"); - expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch"); - expectedPassages2.add("This is the first document to be chunked The document"); - expectedPassages2.add("The document contains a single paragraph two sentences and 24"); - expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); - expectedPassages2.add("This is the second document to be chunked The document"); - expectedPassages2.add("The document contains a single paragraph two sentences and 24"); - expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); - expectedPassages3.add("This is the third document to be chunked The document"); - expectedPassages3.add("The document contains a single paragraph two sentences and 24"); - expectedPassages3.add("and 24 tokens by standard tokenizer in OpenSearch"); - expectedPassages4.add("This is the fourth document to be chunked The document"); - expectedPassages4.add("The document contains a single paragraph two sentences and 24"); - expectedPassages4.add("and 24 tokens by standard tokenizer in OpenSearch"); - - expectedPassages.put("String", expectedPassages1); - expectedPassages.put("List", 
expectedPassages2); - expectedPassages.put("Map", ImmutableMap.of("third", expectedPassages3, "fourth", expectedPassages4)); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } From 98124ee635889718f39f82788e5f36eb28b6bc57 Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 18:46:23 +0800 Subject: [PATCH 075/189] change delimiter UT Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../chunker/DelimiterChunkerTests.java | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index b92999fa3..a1fa4185c 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -13,34 +13,9 @@ import static junit.framework.TestCase.assertEquals; import static org.junit.Assert.assertThrows; import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; -import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.MAX_CHUNK_LIMIT_FIELD; public class DelimiterChunkerTests extends OpenSearchTestCase { - public void testChunkerWithWrongLimitFieldList() { - DelimiterChunker chunker = new DelimiterChunker(); - String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, List.of("-1"), DELIMITER_FIELD, "\n"); - Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("Parameter max_chunk_limit:" + List.of("-1") + " should be integer.", exception.getMessage()); - } - - public void testChunkerWithWrongLimitField() { - DelimiterChunker chunker = new DelimiterChunker(); - String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, "1000\n", DELIMITER_FIELD, "\n"); - Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("Parameter max_chunk_limit:1000\n should be integer.", exception.getMessage()); - } - - public void testChunkerWithNegativeLimit() { - DelimiterChunker chunker = new DelimiterChunker(); - String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(MAX_CHUNK_LIMIT_FIELD, -1, DELIMITER_FIELD, "\n"); - Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("Parameter max_chunk_limit:-1 is not greater than 0.", exception.getMessage()); - } - public void testChunkerWithDelimiterFieldNotString() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -57,14 +32,6 @@ public void testChunkerWithDelimiterFieldNoString() { Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage()); } - public void testChunkerWithLimitNumber() { - DelimiterChunker chunker = new DelimiterChunker(); - String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(DELIMITER_FIELD, "\n", MAX_CHUNK_LIMIT_FIELD, 1); - IllegalStateException exception = assertThrows(IllegalStateException.class, () -> chunker.chunk(content, inputParameters)); - 
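
As context for the delimiter tests above and for the DelimiterChunker.chunk logic touched in the next patch, here is a minimal standalone sketch of the splitting behavior these assertions rely on: the delimiter stays attached to the chunk that precedes it. The class name DelimiterSplitSketch and the main method are illustrative only, not part of the patch.

    import java.util.ArrayList;
    import java.util.List;

    public class DelimiterSplitSketch {
        // Split content on a delimiter, keeping the delimiter attached to the
        // preceding chunk, mirroring the chunk() logic in DelimiterChunker.
        static List<String> split(String content, String delimiter) {
            List<String> chunkResult = new ArrayList<>();
            int start = 0;
            int end = content.indexOf(delimiter);
            while (end != -1) {
                chunkResult.add(content.substring(start, end + delimiter.length()));
                start = end + delimiter.length();
                end = content.indexOf(delimiter, start);
            }
            if (start < content.length()) {
                chunkResult.add(content.substring(start)); // trailing text carries no delimiter
            }
            return chunkResult;
        }

        public static void main(String[] args) {
            // Prints [a\n, b\n, c\n, d] for the "a\nb\nc\nd" content used in the tests
            System.out.println(split("a\nb\nc\nd", "\n"));
        }
    }
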
Assert.assertEquals("Exceed max chunk number: 1", exception.getMessage()); - } - public void testChunker() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; From 353e88eb2ee04fb616935c45ac618384fb168f69 Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 18:47:07 +0800 Subject: [PATCH 076/189] remove delimiter useless code Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 24 ++----------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index e99849b53..64dffd1cd 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -14,10 +14,6 @@ public DelimiterChunker() {} public static String DELIMITER_FIELD = "delimiter"; - public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; - - private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; - @Override public void validateParameters(Map parameters) { if (parameters.containsKey(DELIMITER_FIELD)) { @@ -28,41 +24,25 @@ public void validateParameters(Map parameters) { throw new IllegalArgumentException("delimiter parameters should not be empty."); } } - if (parameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { - Object maxChunkLimit = parameters.get(MAX_CHUNK_LIMIT_FIELD); - if (!(maxChunkLimit instanceof Integer)) { - throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit.toString() + " should be integer."); - } else if ((int) maxChunkLimit < 0) { - throw new IllegalArgumentException("Parameter max_chunk_limit:" + maxChunkLimit + " is negative."); - } - } } @Override public List chunk(String content, Map parameters) { String delimiter = (String) parameters.getOrDefault(DELIMITER_FIELD, "."); - int maxChunkingNumber = (int) parameters.getOrDefault(MAX_CHUNK_LIMIT_FIELD, -1); List chunkResult = new ArrayList<>(); int start = 0; int end = content.indexOf(delimiter); while (end != -1) { - addChunkResult(chunkResult, maxChunkingNumber, content.substring(start, end + delimiter.length())); + chunkResult.add(content.substring(start, end + delimiter.length())); start = end + delimiter.length(); end = content.indexOf(delimiter, start); } if (start < content.length()) { - addChunkResult(chunkResult, maxChunkingNumber, content.substring(start)); + chunkResult.add(content.substring(start)); } return chunkResult; } - - private void addChunkResult(List chunkResult, int maxChunkingNumber, String candidate) { - if (chunkResult.size() >= maxChunkingNumber && maxChunkingNumber > 0) { - throw new IllegalStateException("Exceed max chunk number: " + maxChunkingNumber); - } - chunkResult.add(candidate); - } } From de554e6c2c2b1abb96b2df3b4afc6ceb4e2eff4c Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 19:24:35 +0800 Subject: [PATCH 077/189] add more UT Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 2 +- .../DocumentChunkingProcessorTests.java | 142 ++++++++++++++---- 2 files changed, 114 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 24c95e3d0..3ce01cdcb 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -152,7 +152,7 @@ private List chunkString(String content) { "Unable to create the processor as the number of chunks [" + current_chunk_count + "] exceeds the maximum chunk limit [" - + MAX_CHUNK_LIMIT_FIELD + + max_chunk_limit + "]" ); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 4b15c25f7..dc0337152 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -39,6 +39,7 @@ import static org.mockito.Mockito.when; import static org.mockito.Mockito.mock; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; public class DocumentChunkingProcessorTests extends OpenSearchTestCase { @@ -64,11 +65,11 @@ private AnalysisRegistry getAnalysisRegistry() { @Override public Map> getTokenizers() { return singletonMap( - "keyword", - (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( - name, - () -> new MockTokenizer(MockTokenizer.KEYWORD, false) - ) + "keyword", + (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( + name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false) + ) ); } }; @@ -96,6 +97,13 @@ private Map createFixedTokenLengthParameters() { return parameters; } + private Map createFixedTokenLengthParametersWithMaxChunk(int maxChunkNum) { + Map parameters = new HashMap<>(); + parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); + parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkNum); + return parameters; + } + private Map createDelimiterParameters() { Map parameters = new HashMap<>(); parameters.put(DelimiterChunker.DELIMITER_FIELD, "."); @@ -125,6 +133,17 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance(Map fieldMap, int maxChunkNum) { + Map config = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); + Map registry = new HashMap<>(); + return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + @SneakyThrows private DocumentChunkingProcessor createDelimiterInstance() { Map config = new HashMap<>(); @@ -144,12 +163,30 @@ public void testCreate_whenFieldMapEmpty_failure() { config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, emptyFieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } + @SneakyThrows + public void testCreate_whenMaxChunkNumNegative() { + Map registry = new HashMap<>(); + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + 
fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-1)); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer", illegalArgumentException.getMessage()); + + } + public void testCreate_whenFieldMapWithEmptyParameter_failure() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); @@ -157,8 +194,8 @@ public void testCreate_whenFieldMapWithEmptyParameter_failure() { config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } @@ -170,8 +207,8 @@ public void testCreate_whenFieldMapWithIllegalParameterType_failure() { config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + OpenSearchParseException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } @@ -185,12 +222,12 @@ public void testCreate_whenFieldMapWithNoAlgorithm_failure() { config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", - illegalArgumentException.getMessage() + "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + illegalArgumentException.getMessage() ); } @@ -208,10 +245,10 @@ private String createSourceDataString() { private List createSourceDataList() { List documents = new ArrayList<>(); documents.add( - "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.add( - "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); return documents; } @@ -219,12 +256,12 @@ private List createSourceDataList() { private Map createSourceDataMap() { Map documents = new HashMap<>(); documents.put( - "third", - "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "third", + "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); documents.put( - "fourth", - "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + "fourth", + "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); return documents; } @@ -249,6 +286,53 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { return new IngestDocument(sourceAndMetadata, new HashMap<>()); } + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + for (int i = 0; i < 2; i++) { + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_Exceed() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> processor.execute(ingestDocument) + ); + assertEquals( + illegalArgumentException.getMessage(), + "Unable to create the processor as the number of chunks [" + "3" + "] exceeds the maximum chunk limit [" + "1" + "]" + ); + + } + @SneakyThrows public void 
testExecute_withFixedTokenLength_andSourceDataString_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); @@ -259,8 +343,8 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() { assert (passages instanceof List); List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -275,11 +359,11 @@ public void testExecute_withFixedTokenLength_andSourceDataList_successful() { List expectedPassages = new ArrayList<>(); expectedPassages.add("This is the first document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); expectedPassages.add("This is the second document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -298,8 +382,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } From 2453a7966c347c4f93f9de600ec4201652c0c436 Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 19:25:50 +0800 Subject: [PATCH 078/189] add UT for list inside map Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../DocumentChunkingProcessorTests.java | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index dc0337152..3b2e7eba4 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -388,6 +388,29 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { assertEquals(expectedPassages, passages); } + @SneakyThrows + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap()); + IngestDocument document = 
processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + List expectedPassages = new ArrayList<>(); + + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assert (nestedResult instanceof List); + assertEquals(((List) nestedResult).size(), 2); + for (Object result : (List) nestedResult) { + assert (result instanceof Map); + assert ((Map) result).containsKey(OUTPUT_FIELD); + Object passages = ((Map) result).get(OUTPUT_FIELD); + assert (passages instanceof List); + assertEquals(expectedPassages, passages); + } + } + @SneakyThrows public void testExecute_withDelimiter_andSourceDataString_successful() { DocumentChunkingProcessor processor = createDelimiterInstance(); From 5f001075fae17dc867da65b876d5b34c03e92fba Mon Sep 17 00:00:00 2001 From: xinyual Date: Thu, 7 Mar 2024 19:27:38 +0800 Subject: [PATCH 079/189] add UT for list inside map Signed-off-by: xinyual Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessorTests.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 3b2e7eba4..ae4ff17de 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -97,6 +97,12 @@ private Map createFixedTokenLengthParameters() { return parameters; } + private List> createSourceDataListNestedMap() { + Map documents = new HashMap<>(); + documents.put(INPUT_FIELD, createSourceDataString()); + return List.of(documents, documents); + } + private Map createFixedTokenLengthParametersWithMaxChunk(int maxChunkNum) { Map parameters = new HashMap<>(); parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); @@ -398,8 +404,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_suc List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); - expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch"); assert (nestedResult instanceof List); assertEquals(((List) nestedResult).size(), 2); for (Object result : (List) nestedResult) { From fc94955f044ff137f4abe1c99a8271ced9784191 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 11:43:29 +0800 Subject: [PATCH 080/189] update unit tests for chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 7 +- .../DocumentChunkingProcessorTests.java | 134 +++++++++++++----- 2 files changed, 99 insertions(+), 42 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 3ce01cdcb..60ea84902 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -11,7 +11,6 @@ import java.util.Objects; import lombok.extern.log4j.Log4j2; -import org.apache.commons.lang3.StringUtils; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.env.Environment; import org.opensearch.index.IndexService; @@ -111,7 +110,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { } if (!(algorithmValue instanceof Map)) { throw new IllegalArgumentException( - "Unable to create the processor as [" + ALGORITHM_FIELD + "] cannot be cast to [" + Map.class.getName() + "]" + "Unable to create the processor as [" + algorithmKey + "] parameters cannot be cast to [" + Map.class.getName() + "]" ); } FieldChunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); @@ -215,8 +214,6 @@ private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) { validateNestedTypeValue(sourceKey, sourceValue, 1); } else if (!String.class.isAssignableFrom(sourceValueClass)) { throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it"); - } else if (StringUtils.isBlank(sourceValue.toString())) { - throw new IllegalArgumentException("field [" + sourceKey + "] has empty string value, cannot process it"); } } } @@ -247,8 +244,6 @@ private void validateListTypeValue(String sourceKey, Object sourceValue, int max throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it"); } else if (!(value instanceof String)) { throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it"); - } else if (StringUtils.isBlank(value.toString())) { - throw new IllegalArgumentException("list type field [" + sourceKey + "] has empty string, cannot process it"); } } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index ae4ff17de..dec5ff01b 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -7,7 +7,6 @@ import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; -import org.mockito.Mock; import org.opensearch.OpenSearchParseException; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.metadata.Metadata; @@ -38,6 +37,7 @@ import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.when; import static org.mockito.Mockito.mock; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; @@ -53,9 +53,6 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase { private static final String OUTPUT_FIELD = "body_chunk"; private static final String INDEX_NAME = "_index"; - @Mock - private Environment environment; - @SneakyThrows private AnalysisRegistry getAnalysisRegistry() { Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); @@ -133,7 +130,7 @@ private 
DocumentChunkingProcessor createFixedTokenLengthInstance(Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); @@ -144,7 +141,7 @@ private DocumentChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkNum( Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); @@ -157,16 +154,16 @@ private DocumentChunkingProcessor createDelimiterInstance() { Map algorithmMap = new HashMap<>(); algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } - public void testCreate_whenFieldMapEmpty_failure() { + public void testCreate_whenAlgorithmFieldMissing_failure() { Map config = new HashMap<>(); - Map emptyFieldMap = new HashMap<>(); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, emptyFieldMap); + Map fieldMap = new HashMap<>(); + config.put(FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( OpenSearchParseException.class, @@ -183,7 +180,7 @@ public void testCreate_whenMaxChunkNumNegative() { Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-1)); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -193,38 +190,69 @@ public void testCreate_whenMaxChunkNumNegative() { } - public void testCreate_whenFieldMapWithEmptyParameter_failure() { + public void testCreate_whenAlgorithmFieldNoAlgorithm_failure() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); - fieldMap.put("key", null); + Map algorithmMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); - OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); + assertEquals( + "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + illegalArgumentException.getMessage() + ); } - 
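
To make the refactor these tests validate concrete, here is a minimal sketch of a well-formed processor config after the change: field_map now maps an input field directly to an output field, and a single entry under algorithm carries the chunker parameters. The literal keys mirror the constants used above; the wrapper class and main method are illustrative assumptions, not plugin code.

    import java.util.HashMap;
    import java.util.Map;

    public class ChunkingConfigSketch {
        public static void main(String[] args) {
            Map<String, Object> parameters = new HashMap<>();
            parameters.put("token_limit", 10);          // FixedTokenLengthChunker.TOKEN_LIMIT_FIELD
            Map<String, Object> algorithmMap = new HashMap<>();
            algorithmMap.put("fix_length", parameters); // ChunkerFactory.FIXED_LENGTH_ALGORITHM
            Map<String, Object> fieldMap = new HashMap<>();
            fieldMap.put("body", "body_chunk");         // input field -> output field
            Map<String, Object> config = new HashMap<>();
            config.put("field_map", fieldMap);          // DocumentChunkingProcessor.FIELD_MAP_FIELD
            config.put("algorithm", algorithmMap);      // DocumentChunkingProcessor.ALGORITHM_FIELD
            // A factory.create(registry, tag, description, config) call with this map
            // should succeed, whereas dropping the "algorithm" entry triggers the
            // "[algorithm] required property is missing" error asserted above.
            System.out.println(config);
        }
    }
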
public void testCreate_whenFieldMapWithIllegalParameterType_failure() { + public void testCreate_whenAlgorithmFieldMultipleAlgorithm_failure() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); - fieldMap.put("key", "value"); + Map algorithmMap = new HashMap<>(); + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); + algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); + config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); - OpenSearchParseException openSearchParseException = assertThrows( - OpenSearchParseException.class, + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); + assertEquals( + "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + illegalArgumentException.getMessage() + ); + } + + public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_failure() { + Map config = new HashMap<>(); + Map fieldMap = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + String invalid_algorithm_type = "invalid algorithm"; + fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + algorithmMap.put(invalid_algorithm_type, createFixedTokenLengthParameters()); + config.put(ALGORITHM_FIELD, algorithmMap); + Map registry = new HashMap<>(); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + ); + assert (illegalArgumentException.getMessage() + .contains("Unable to create the processor as chunker algorithm [" + invalid_algorithm_type + "] is not supported")); } - public void testCreate_whenFieldMapWithNoAlgorithm_failure() { + public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_failure() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, 1); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( @@ -232,7 +260,11 @@ public void testCreate_whenFieldMapWithNoAlgorithm_failure() { () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + "Unable to create the processor as [" + + ChunkerFactory.FIXED_LENGTH_ALGORITHM + + "] parameters cannot be cast to [" + + Map.class.getName() + + "]", illegalArgumentException.getMessage() ); } @@ -248,7 +280,7 @@ private String createSourceDataString() { return "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; } - private List createSourceDataList() { + private List createSourceDataListStrings() { List documents = new ArrayList<>(); documents.add( "This is the first document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." @@ -259,16 +291,21 @@ private List createSourceDataList() { return documents; } - private Map createSourceDataMap() { - Map documents = new HashMap<>(); - documents.put( - "third", - "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + private List createSourceDataListHybridType() { + List documents = new ArrayList<>(); + documents.add( + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); - documents.put( - "fourth", - "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + documents.add(1); + return documents; + } + + private List createSourceDataListWithNull() { + List documents = new ArrayList<>(); + documents.add( + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); + documents.add(null); return documents; } @@ -355,9 +392,9 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() { } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataList_successful() { + public void testExecute_withFixedTokenLength_andSourceDataListStrings_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); - IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataList()); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); @@ -373,6 +410,32 @@ public void testExecute_withFixedTokenLength_andSourceDataList_successful() { assertEquals(expectedPassages, passages); } + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataListHybridType_failure() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListHybridType()); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> processor.execute(ingestDocument) + ); + assertEquals( + "list type field [" + INPUT_FIELD + "] has non string value, cannot process it", + illegalArgumentException.getMessage() + ); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataListWithNull_failure() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListWithNull()); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> processor.execute(ingestDocument) + ); + assertEquals("list type field [" + INPUT_FIELD + "] has null, cannot process it", illegalArgumentException.getMessage()); + } + + @SuppressWarnings("unchecked") @SneakyThrows public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); @@ 
-386,15 +449,14 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); - assertEquals(expectedPassages, passages); } @SneakyThrows + @SuppressWarnings("unchecked") public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap()); From 388fd435b2acbe984c8729d4d90bcdacbf227013 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 12:06:48 +0800 Subject: [PATCH 081/189] add more unit tests for chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 4 +- .../DocumentChunkingProcessorTests.java | 63 +++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 60ea84902..f27ec7930 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -181,7 +181,7 @@ private List chunkLeafType(Object value) { @Override public IngestDocument execute(IngestDocument ingestDocument) { - validateEmbeddingFieldsValue(ingestDocument); + validateFieldsValue(ingestDocument); current_chunk_count = 0; if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters @@ -203,7 +203,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { return ingestDocument; } - private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) { + private void validateFieldsValue(IngestDocument ingestDocument) { Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); for (Map.Entry embeddingFieldsEntry : fieldMap.entrySet()) { Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey()); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index dec5ff01b..acb0c91f0 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -315,6 +315,24 @@ private Map createSourceDataNestedMap() { return documents; } + private Map createSourceDataInvalidNestedMap() { + Map documents = new HashMap<>(); + documents.put(INPUT_FIELD, Map.of(INPUT_NESTED_FIELD_KEY, 1)); + return documents; + } + + private Map createMaxDepthLimitExceedMap(int maxDepth) { + if (maxDepth > 21) { + return null; + } + Map resultMap = new HashMap<>(); + Map innerMap = createMaxDepthLimitExceedMap(maxDepth + 1); + if (innerMap != null) { + resultMap.put(INPUT_FIELD, innerMap); + } + return resultMap; + } + private IngestDocument createIngestDocumentWithNestedSourceData(Object sourceData) { Map sourceAndMetadata = new HashMap<>(); sourceAndMetadata.put(INPUT_NESTED_FIELD_KEY, sourceData); 
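
The nested-field tests above revolve around a recursive walk of field_map against the document source. Below is a minimal sketch of that traversal, with the actual chunking stubbed out; the class name and the List.of stub are illustrative assumptions, not the patched implementation.

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class NestedFieldMapSketch {
        // Walk a nested field_map against the document source: map values recurse one
        // level deeper, string values name the output key for the chunked passages.
        // Chunking itself is stubbed; the traversal shape is what the tests exercise.
        @SuppressWarnings("unchecked")
        static void chunkMapType(Map<String, Object> source, Map<String, Object> fieldMap) {
            for (Map.Entry<String, Object> entry : fieldMap.entrySet()) {
                Object target = entry.getValue();
                if (target instanceof Map) {
                    chunkMapType((Map<String, Object>) source.get(entry.getKey()), (Map<String, Object>) target);
                } else if (target instanceof String) {
                    String content = (String) source.get(entry.getKey());
                    source.put((String) target, List.of(content)); // stub: real code invokes the chunker here
                }
            }
        }

        public static void main(String[] args) {
            Map<String, Object> nested = new HashMap<>();
            nested.put("body", "This is an example document to be chunked.");
            Map<String, Object> source = new HashMap<>();
            source.put("nested", nested);
            Map<String, Object> fieldMap = Map.of("nested", Map.of("body", "body_chunk"));
            chunkMapType(source, fieldMap);
            System.out.println(source); // the nested map now also holds body_chunk
        }
    }
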
@@ -391,6 +409,23 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() { assertEquals(expectedPassages, passages); } + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataInvalidType_failure() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + Map sourceAndMetadata = new HashMap<>(); + sourceAndMetadata.put(INPUT_FIELD, 1); + sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME); + IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>()); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> processor.execute(ingestDocument) + ); + assertEquals( + "field [" + INPUT_FIELD + "] is neither string nor nested type, cannot process it", + illegalArgumentException.getMessage() + ); + } + @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataListStrings_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); @@ -455,6 +490,34 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { assertEquals(expectedPassages, passages); } + @SneakyThrows + public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_failure() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createMaxDepthLimitExceedMap(0)); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> processor.execute(ingestDocument) + ); + assertEquals( + "map type field [" + INPUT_NESTED_FIELD_KEY + "] reached max depth limit, cannot process it", + illegalArgumentException.getMessage() + ); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_failure() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataInvalidNestedMap()); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> processor.execute(ingestDocument) + ); + assertEquals( + "map type field [" + INPUT_NESTED_FIELD_KEY + "] has non-string type, cannot process it", + illegalArgumentException.getMessage() + ); + } + @SneakyThrows @SuppressWarnings("unchecked") public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_successful() { From bb35c79a6fb46e36c8731f7cdd3e8c1a8ded4915 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 12:09:37 +0800 Subject: [PATCH 082/189] resolve code review comments Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index f27ec7930..b4c33b2ee 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -155,7 +155,7 @@ private List chunkString(String content) { + "]" ); } - return chunker.chunk(content, chunkerParameters); + return result; } private List chunkList(List contentList) { From b4d5fda5d91d02b7fd96ba3782893c7c7374ba62 Mon Sep 17 00:00:00 
2001 From: yuye-aws Date: Fri, 8 Mar 2024 12:49:50 +0800 Subject: [PATCH 083/189] add java doc Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 7 ++ .../processor/chunker/ChunkerFactory.java | 3 + .../processor/chunker/DelimiterChunker.java | 18 +++ .../processor/chunker/FieldChunker.java | 18 +++ .../chunker/FixedTokenLengthChunker.java | 111 +++++++++++------- 5 files changed, 114 insertions(+), 43 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index b4c33b2ee..18384614d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -31,6 +31,10 @@ import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.DELIMITER_ALGORITHM; import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; +/** + * This processor is used for chunking user input data for text embedding processing. The algorithm field indicates the chunking algorithm and its parameters, + * and the field_map field indicates which fields need chunking and the corresponding keys for the chunking results. + */ @Log4j2 public final class DocumentChunkingProcessor extends AbstractProcessor { @@ -277,6 +281,9 @@ private void chunkMapType(Map sourceAndMetadataMap, Map parameters) { if (parameters.containsKey(DELIMITER_FIELD)) { @@ -26,6 +37,13 @@ public void validateParameters(Map parameters) { } } + + /** + * Return the chunked passages for delimiter algorithm + * + * @param content input string + * @param parameters a map containing the delimiter parameter + */ @Override public List chunk(String content, Map parameters) { String delimiter = (String) parameters.getOrDefault(DELIMITER_FIELD, "."); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java index 194e35b97..84aeaeb03 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java @@ -7,8 +7,26 @@ import java.util.Map; import java.util.List; +/** + * The interface for all chunking algorithms.
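As a usage sketch of the configuration the class javadoc above describes, the two top-level properties can be assembled like this; the field names and parameter values are illustrative, only the constants come from the plugin:

    import java.util.HashMap;
    import java.util.Map;
    import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
    import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
    import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;

    class ChunkingConfigSketch {
        // builds a config that chunks "body" into "body_chunk" with the fixed token length algorithm;
        // "body", "body_chunk" and the token limit value are illustrative
        static Map<String, Object> exampleConfig() {
            Map<String, Object> chunkerParameters = new HashMap<>();
            chunkerParameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); // tokens per chunked passage
            Map<String, Object> config = new HashMap<>();
            config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, Map.of("body", "body_chunk"));
            config.put(DocumentChunkingProcessor.ALGORITHM_FIELD, Map.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, chunkerParameters));
            return config;
        }
    }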
+ * All algorithms need to validate parameters and chunk the content. + */ public interface FieldChunker { + + /** + * Validate the parameters for the chunking algorithm + * + * @param parameters a map containing parameters for chunking algorithms + * @throws IllegalArgumentException if parameters are invalid + */ void validateParameters(Map parameters); + /** + * Chunk the incoming string and return chunked passages + * + * @param content input string + * @param parameters a map containing parameters for chunking algorithms + * @return Chunked passages + */ List chunk(String content, Map parameters); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 5d0bcdef9..2610270f3 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -17,6 +17,9 @@ import org.opensearch.index.analysis.AnalysisRegistry; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; +/** + * The implementation of fixed token length chunker algorithm. + */ @Log4j2 public class FixedTokenLengthChunker implements FieldChunker { @@ -39,6 +42,22 @@ public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { this.analysisRegistry = analysisRegistry; } + private void validatePositiveIntegerParameter(Map parameters, String fieldName) { + // this method validates that the parameter is a positive integer + if (!parameters.containsKey(fieldName)) { + // all parameters are optional + return; + } + if (!(parameters.get(fieldName) instanceof Number)) { + throw new IllegalArgumentException( + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + if (((Number) parameters.get(fieldName)).intValue() <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + } + } + private List tokenize(String content, String tokenizer, int maxTokenCount) { AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); analyzeRequest.text(content); @@ -49,8 +68,56 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun } catch (IOException e) { throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); - }; + } + + /** + * Validate the parameters for fixed token length algorithm + * + * @param parameters a map containing the following parameters + * 1. tokenizer the analyzer tokenizer in opensearch, please check https://opensearch.org/docs/latest/analyzers/tokenizers/index/ + * 2. token_limit the token limit for each chunked passage + * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage + * 4.
max_token_count the max token limit for the tokenizer + * @throws IllegalArgumentException If max_token_count or token_limit is not a positive integer + * @throws IllegalArgumentException If overlap_rate < 0 or overlap_rate > 0.5 + * @throws IllegalArgumentException If tokenizer is not a string + */ + @Override + public void validateParameters(Map parameters) { + validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD); + validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD); + + if (parameters.containsKey(OVERLAP_RATE_FIELD)) { + if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { + throw new IllegalArgumentException( + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + BigDecimal overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); + if (overlap_rate.compareTo(BigDecimal.ZERO) < 0 || overlap_rate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { + throw new IllegalArgumentException( + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND + ); + } + } + + if (parameters.containsKey(TOKENIZER_FIELD) && !(parameters.get(TOKENIZER_FIELD) instanceof String)) { + throw new IllegalArgumentException( + "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + ); + } + } + /** + * Return the chunked passages for fixed token length algorithm + * + * @param content input string + * @param parameters a map containing the following parameters + * 1. tokenizer the analyzer tokenizer in opensearch, please check https://opensearch.org/docs/latest/analyzers/tokenizers/index/ + * 2. token_limit the token limit for each chunked passage + * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage + * 4.
max_token_count the max token limit for the tokenizer + */ @Override public List chunk(String content, Map parameters) { // prior to chunking, parameters have been validated @@ -99,46 +166,4 @@ public List chunk(String content, Map parameters) { } return passages; } - - private void validatePositiveIntegerParameter(Map parameters, String fieldName) { - // this method validate that parameter is a positive integer - if (!parameters.containsKey(fieldName)) { - // all parameters are optional - return; - } - if (!(parameters.get(fieldName) instanceof Number)) { - throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - if (((Number) parameters.get(fieldName)).intValue() <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); - } - } - - @Override - public void validateParameters(Map parameters) { - validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD); - validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD); - - if (parameters.containsKey(OVERLAP_RATE_FIELD)) { - if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { - throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - BigDecimal overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); - if (overlap_rate.compareTo(BigDecimal.ZERO) < 0 || overlap_rate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { - throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND - ); - } - } - - if (parameters.containsKey(TOKENIZER_FIELD) && !(parameters.get(TOKENIZER_FIELD) instanceof String)) { - throw new IllegalArgumentException( - "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" - ); - } - } } From 453dd356b82c1aab433f0aebf55d45ef5c7ab216 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 12:53:12 +0800 Subject: [PATCH 084/189] update java doc Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 5 +++++ .../neuralsearch/processor/chunker/DelimiterChunker.java | 2 +- .../processor/chunker/FixedTokenLengthChunker.java | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 18384614d..dec310940 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -9,6 +9,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; +import java.util.function.BiConsumer; import lombok.extern.log4j.Log4j2; import org.opensearch.cluster.metadata.IndexMetadata; @@ -183,6 +184,10 @@ private List chunkLeafType(Object value) { return chunkedResult; } + /** + * This method will be invoked by PipelineService to perform chunking and then write back chunking results to the document. + * @param ingestDocument {@link IngestDocument} which is the document passed to processor. 
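The chunk implementation above reduces to a sliding window over the token stream once the overlap token number is fixed. A simplified sketch of that windowing arithmetic, assuming tokenization has already happened (the real implementation tokenizes through the index analyzer and rounds the overlap with BigDecimal):

    import java.util.ArrayList;
    import java.util.List;

    class WindowingSketch {
        static List<String> window(List<String> tokens, int tokenLimit, double overlapRate) {
            // the overlap must stay below the window size so the start index always advances
            int overlap = Math.min((int) Math.floor(tokenLimit * overlapRate), tokenLimit - 1);
            List<String> passages = new ArrayList<>();
            for (int start = 0; start < tokens.size(); start += tokenLimit - overlap) {
                int end = Math.min(start + tokenLimit, tokens.size());
                passages.add(String.join(" ", tokens.subList(start, end)));
                if (end == tokens.size()) {
                    break;
                }
            }
            return passages;
        }
    }

On the 24-token sentence used throughout the tests, a token limit of 10 with zero overlap yields three passages of 10, 10 and 4 tokens, matching the expected outputs asserted above.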
+ */ @Override public IngestDocument execute(IngestDocument ingestDocument) { validateFieldsValue(ingestDocument); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index b47d52a80..cdfafc1ab 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -9,7 +9,7 @@ import java.util.Map; /** - * The implementation of delimiter algorithm + * The implementation {@link FieldChunker} for delimiter algorithm */ public class DelimiterChunker implements FieldChunker { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 2610270f3..01070c01c 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -18,7 +18,7 @@ import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; /** - * The implementation of fixed token length chunker algorithm. + * The implementation {@link FieldChunker} for fixed token length algorithm. */ @Log4j2 public class FixedTokenLengthChunker implements FieldChunker { From 8c8fbaf29bd8904578baa48b80ed2c72fc665030 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 12:59:16 +0800 Subject: [PATCH 085/189] update java doc Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 1 - .../neuralsearch/processor/chunker/ChunkerFactory.java | 2 +- .../processor/chunker/FixedTokenLengthChunker.java | 8 ++++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index dec310940..82dffe170 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -9,7 +9,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; -import java.util.function.BiConsumer; import lombok.extern.log4j.Log4j2; import org.opensearch.cluster.metadata.IndexMetadata; diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 532b90365..57894c0f5 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -9,7 +9,7 @@ import java.util.Set; /** - * A factory to create different chunking algorithm classes. + * A factory to create different chunking algorithm classes and return all supported chunking algorithms. 
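Downstream callers combine that factory with the FieldChunker contract in a create, validate, chunk sequence. A condensed sketch of the flow, assuming the caller already holds an AnalysisRegistry and a parameter map:

    import java.util.List;
    import java.util.Map;
    import org.opensearch.index.analysis.AnalysisRegistry;
    import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
    import org.opensearch.neuralsearch.processor.chunker.FieldChunker;

    class FactoryFlowSketch {
        static List<String> chunkWith(String algorithmName, AnalysisRegistry registry, String content, Map<String, Object> parameters) {
            FieldChunker chunker = ChunkerFactory.create(algorithmName, registry); // dispatches on the algorithm name
            chunker.validateParameters(parameters); // throws IllegalArgumentException on invalid input
            return chunker.chunk(content, parameters);
        }
    }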
*/ public class ChunkerFactory { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 01070c01c..a95a0449a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -50,7 +50,7 @@ private void validatePositiveIntegerParameter(Map parameters, St } if (!(parameters.get(fieldName) instanceof Number)) { throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } if (((Number) parameters.get(fieldName)).intValue() <= 0) { @@ -90,20 +90,20 @@ public void validateParameters(Map parameters) { if (parameters.containsKey(OVERLAP_RATE_FIELD)) { if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } BigDecimal overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); if (overlap_rate.compareTo(BigDecimal.ZERO) < 0 || overlap_rate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND ); } } if (parameters.containsKey(TOKENIZER_FIELD) && !(parameters.get(TOKENIZER_FIELD) instanceof String)) { throw new IllegalArgumentException( - "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" ); } } From b58898337b64b846b28825065c23baca3c746e80 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 13:02:58 +0800 Subject: [PATCH 086/189] fix import order Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 1 - .../processor/chunker/ChunkerFactory.java | 5 ++--- .../processor/chunker/DelimiterChunker.java | 6 ++---- .../chunker/FixedTokenLengthChunker.java | 8 +++---- .../DocumentChunkingProcessorIT.java | 11 +++++----- .../DocumentChunkingProcessorTests.java | 21 +++++++++---------- 6 files changed, 23 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 82dffe170..81d7d2b17 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -26,7 +26,6 @@ import org.opensearch.neuralsearch.processor.chunker.FieldChunker; import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; - import static org.opensearch.ingest.ConfigurationUtils.readMap; import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.DELIMITER_ALGORITHM; import static 
org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 57894c0f5..6dea95a22 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -4,10 +4,10 @@ */ package org.opensearch.neuralsearch.processor.chunker; -import org.opensearch.index.analysis.AnalysisRegistry; - import java.util.Set; +import org.opensearch.index.analysis.AnalysisRegistry; + /** * A factory to create different chunking algorithm classes and return all supported chunking algorithms. */ @@ -38,5 +38,4 @@ public static FieldChunker create(String type, AnalysisRegistry analysisRegistry public static Set getAllChunkers() { return Set.of(FIXED_LENGTH_ALGORITHM, DELIMITER_ALGORITHM); } - } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index cdfafc1ab..3fd12f8f1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -4,9 +4,9 @@ */ package org.opensearch.neuralsearch.processor.chunker; -import java.util.ArrayList; -import java.util.List; import java.util.Map; +import java.util.List; +import java.util.ArrayList; /** * The implementation {@link FieldChunker} for delimiter algorithm @@ -17,7 +17,6 @@ public DelimiterChunker() {} public static String DELIMITER_FIELD = "delimiter"; - /** * Validate the chunked passages for delimiter algorithm * @@ -37,7 +36,6 @@ public void validateParameters(Map parameters) { } } - /** * Return the chunked passages for delimiter algorithm * diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index a95a0449a..793966b05 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -5,14 +5,14 @@ package org.opensearch.neuralsearch.processor.chunker; import java.io.IOException; +import java.math.BigDecimal; import java.math.RoundingMode; -import java.util.ArrayList; -import java.util.List; import java.util.Map; -import java.math.BigDecimal; +import java.util.List; +import java.util.ArrayList; import java.util.stream.Collectors; - import lombok.extern.log4j.Log4j2; + import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.index.analysis.AnalysisRegistry; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index d136af9c4..c47f8e225 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -9,18 +9,17 @@ import org.apache.hc.core5.http.io.entity.EntityUtils; import org.apache.hc.core5.http.message.BasicHeader; import org.junit.Before; -import 
org.opensearch.client.Response; -import org.opensearch.common.xcontent.XContentHelper; -import org.opensearch.common.xcontent.XContentType; -import org.opensearch.index.query.MatchAllQueryBuilder; -import org.opensearch.neuralsearch.BaseNeuralSearchIT; - import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.opensearch.client.Response; +import org.opensearch.common.xcontent.XContentHelper; +import org.opensearch.common.xcontent.XContentType; +import org.opensearch.index.query.MatchAllQueryBuilder; +import org.opensearch.neuralsearch.BaseNeuralSearchIT; import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT; public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index acb0c91f0..c02c76256 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -7,6 +7,16 @@ import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import static java.util.Collections.singletonList; +import static java.util.Collections.singletonMap; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.when; +import static org.mockito.Mockito.mock; + import org.opensearch.OpenSearchParseException; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.metadata.Metadata; @@ -26,17 +36,6 @@ import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import static java.util.Collections.singletonList; -import static java.util.Collections.singletonMap; -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.Mockito.when; -import static org.mockito.Mockito.mock; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; From 23dd7695a92df849b243f760fc80fafee7896136 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 13:04:39 +0800 Subject: [PATCH 087/189] update java doc Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/DelimiterChunker.java | 2 +- .../neuralsearch/processor/chunker/FixedTokenLengthChunker.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 3fd12f8f1..787e94cd7 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -9,7 +9,7 @@ import java.util.ArrayList; /** - * The implementation {@link FieldChunker} for delimiter algorithm + * The implementation {@link 
FieldChunker} for delimiter algorithm */ public class DelimiterChunker implements FieldChunker { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 793966b05..0c691eae6 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -18,7 +18,7 @@ import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; /** - * The implementation {@link FieldChunker} for fixed token length algorithm. + * The implementation {@link FieldChunker} for fixed token length algorithm. */ @Log4j2 public class FixedTokenLengthChunker implements FieldChunker { From 3ad78da7714b58f300e81388f8caa93d9376e4aa Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 13:06:34 +0800 Subject: [PATCH 088/189] fix java doc error Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/FixedTokenLengthChunker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 0c691eae6..fe08ee074 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -79,7 +79,7 @@ private List tokenize(String content, String tokenizer, int maxTokenCoun * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage * 4.
max_token_count the max token limit for the tokenizer * @throws IllegalArgumentException If max_token_count or token_limit is not a positive integer - * @throws IllegalArgumentException If overlap_rate < 0 or overlap_rate > 0.5 + * @throws IllegalArgumentException If overlap_rate is not within range [0, 0.5] * @throws IllegalArgumentException If tokenizer is not a string */ @Override From abb9bde7fadfa84c4c6e9d7cb31d8d514204ec27 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 13:14:12 +0800 Subject: [PATCH 089/189] fix update ut for fixed token length chunker Signed-off-by: yuye-aws --- .../processor/chunker/FixedTokenLengthChunkerTests.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 0633213fb..75f40808f 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -26,6 +26,7 @@ import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD; public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { @@ -55,7 +56,6 @@ public Map> getTokeniz public void testValidateParameters_whenNoParams_thenSuccessful() { Map parameters = new HashMap<>(); - ; FixedTokenLengthChunker.validateParameters(parameters); } @@ -124,6 +124,8 @@ public void testValidateParameters_whenIllegalTokenizerType_thenFail() { public void testChunk_withTokenLimit_10() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(TOKENIZER_FIELD, "standard"); + parameters.put(MAX_TOKEN_COUNT_FIELD, 10000); String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = FixedTokenLengthChunker.chunk(content, parameters); @@ -137,6 +139,8 @@ public void testChunk_withTokenLimit_20() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 20); + parameters.put(TOKENIZER_FIELD, "standard"); + parameters.put(MAX_TOKEN_COUNT_FIELD, 10000); String content = "This is an example document to be chunked.
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = FixedTokenLengthChunker.chunk(content, parameters); From 82aa21943cc518adf80d186e93a2d626febd928d Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 15:41:44 +0800 Subject: [PATCH 090/189] resolve code review comments Signed-off-by: yuye-aws --- .../neuralsearch/plugin/NeuralSearch.java | 3 +- .../processor/DocumentChunkingProcessor.java | 75 +++--------- .../processor/chunker/DelimiterChunker.java | 1 - .../chunker/FixedTokenLengthChunker.java | 3 +- .../DocumentChunkingProcessorFactory.java | 66 ++++++++++ .../DocumentChunkingProcessorTests.java | 69 ++++++----- .../chunker/DelimiterChunkerTests.java | 2 - ...DocumentChunkingProcessorFactoryTests.java | 114 ++++++++++++++++++ 8 files changed, 236 insertions(+), 97 deletions(-) create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java create mode 100644 src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index 80fcf90f4..8f60a6ff8 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -35,6 +35,7 @@ import org.opensearch.neuralsearch.processor.TextImageEmbeddingProcessor; import org.opensearch.neuralsearch.processor.combination.ScoreCombinationFactory; import org.opensearch.neuralsearch.processor.combination.ScoreCombiner; +import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory; import org.opensearch.neuralsearch.processor.factory.NormalizationProcessorFactory; import org.opensearch.neuralsearch.processor.factory.RerankProcessorFactory; import org.opensearch.neuralsearch.processor.factory.SparseEncodingProcessorFactory; @@ -117,7 +118,7 @@ public Map getProcessors(Processor.Parameters paramet TextImageEmbeddingProcessor.TYPE, new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()), DocumentChunkingProcessor.TYPE, - new DocumentChunkingProcessor.Factory( + new DocumentChunkingProcessorFactory( parameters.env, parameters.ingestService.getClusterService(), parameters.indicesService, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 81d7d2b17..0f8b70b30 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -21,12 +21,10 @@ import org.opensearch.index.IndexSettings; import org.opensearch.ingest.AbstractProcessor; import org.opensearch.ingest.IngestDocument; -import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.FieldChunker; import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import static org.opensearch.ingest.ConfigurationUtils.readMap; import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.DELIMITER_ALGORITHM; import static 
org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; @@ -47,9 +45,9 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; - private int current_chunk_count = 0; + private int currentChunkCount = 0; - private int max_chunk_limit = DEFAULT_MAX_CHUNK_LIMIT; + private int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); private String chunkerType; @@ -122,11 +120,17 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { this.chunkerParameters = (Map) algorithmValue; chunker.validateParameters(chunkerParameters); if (((Map) algorithmValue).containsKey(MAX_CHUNK_LIMIT_FIELD)) { - int max_chunk_limit = ((Number) ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD)).intValue(); - if (max_chunk_limit <= 0) { + Object maxChunkLimitObject = ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD); + if (!(maxChunkLimitObject instanceof Number)) { + throw new IllegalArgumentException( + "Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + int maxChunkLimit = ((Number) maxChunkLimitObject).intValue(); + if (maxChunkLimit <= 0 && maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT) { throw new IllegalArgumentException("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer"); } - this.max_chunk_limit = max_chunk_limit; + this.maxChunkLimit = maxChunkLimit; } } } @@ -148,13 +152,13 @@ private boolean isListString(Object value) { private List chunkString(String content) { FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); List result = chunker.chunk(content, chunkerParameters); - current_chunk_count += result.size(); - if (max_chunk_limit != DEFAULT_MAX_CHUNK_LIMIT && current_chunk_count > max_chunk_limit) { + currentChunkCount += result.size(); + if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && currentChunkCount > maxChunkLimit) { throw new IllegalArgumentException( "Unable to create the processor as the number of chunks [" - + current_chunk_count + + currentChunkCount + "] exceeds the maximum chunk limit [" - + max_chunk_limit + + maxChunkLimit + "]" ); } @@ -189,7 +193,7 @@ private List chunkLeafType(Object value) { @Override public IngestDocument execute(IngestDocument ingestDocument) { validateFieldsValue(ingestDocument); - current_chunk_count = 0; + currentChunkCount = 0; if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); @@ -283,51 +287,4 @@ private void chunkMapType(Map sourceAndMetadataMap, Map registry, - String processorTag, - String description, - Map config - ) throws Exception { - Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); - Map algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD); - return new DocumentChunkingProcessor( - processorTag, - description, - fieldMap, - algorithmMap, - environment, - clusterService, - indicesService, - analysisRegistry - ); - } - } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 787e94cd7..8fa8ec088 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -59,6 +59,5 @@ 
public List chunk(String content, Map parameters) { chunkResult.add(content.substring(start)); } return chunkResult; - } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index fe08ee074..b58a9c157 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -122,7 +122,7 @@ public void validateParameters(Map parameters) { public List chunk(String content, Map parameters) { // prior to chunking, parameters have been validated int tokenLimit = DEFAULT_TOKEN_LIMIT; - BigDecimal overlap_rate = new BigDecimal(String.valueOf(DEFAULT_OVERLAP_RATE)); + BigDecimal overlap_rate = DEFAULT_OVERLAP_RATE; int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; String tokenizer = DEFAULT_TOKENIZER; @@ -148,7 +148,6 @@ public List chunk(String content, Map parameters) { BigDecimal overlapTokenNumberBigDecimal = overlap_rate.multiply(new BigDecimal(String.valueOf(tokenLimit))) .setScale(0, RoundingMode.DOWN); int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue(); - ; // overlapTokenNumber must be smaller than the token limit overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java new file mode 100644 index 000000000..9fa38b48a --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java @@ -0,0 +1,66 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.factory; + +import java.util.Map; + +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.env.Environment; +import org.opensearch.index.analysis.AnalysisRegistry; +import org.opensearch.indices.IndicesService; +import org.opensearch.ingest.Processor; +import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; +import static org.opensearch.ingest.ConfigurationUtils.readMap; + +/** + * Factory for chunking ingest processor for ingestion pipeline. + * Instantiates processor based on user provided input. 
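A usage sketch for the factory this javadoc introduces, mirroring how the tests further down construct processors; the tag and description strings here are illustrative:

    import java.util.HashMap;
    import java.util.Map;
    import org.opensearch.ingest.Processor;
    import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
    import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory;

    class FactoryUsageSketch {
        // the factory is assumed to be wired with environment, cluster service,
        // indices service and analysis registry, as the constructor above shows
        static DocumentChunkingProcessor createProcessor(DocumentChunkingProcessorFactory factory, Map<String, Object> config) throws Exception {
            Map<String, Processor.Factory> registry = new HashMap<>();
            // config must carry both required properties, field_map and algorithm,
            // or create() fails with a "required property is missing" error
            return factory.create(registry, "chunking-processor-tag", "illustrative description", config);
        }
    }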
+ */ +public class DocumentChunkingProcessorFactory implements Processor.Factory { + + private final Environment environment; + + private final ClusterService clusterService; + + private final IndicesService indicesService; + + private final AnalysisRegistry analysisRegistry; + + public DocumentChunkingProcessorFactory( + Environment environment, + ClusterService clusterService, + IndicesService indicesService, + AnalysisRegistry analysisRegistry + ) { + this.environment = environment; + this.clusterService = clusterService; + this.indicesService = indicesService; + this.analysisRegistry = analysisRegistry; + } + + @Override + public DocumentChunkingProcessor create( + Map registry, + String processorTag, + String description, + Map config + ) throws Exception { + Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); + Map algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD); + return new DocumentChunkingProcessor( + processorTag, + description, + fieldMap, + algorithmMap, + environment, + clusterService, + indicesService, + analysisRegistry + ); + } +} diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index c02c76256..b444e19b7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -34,6 +34,7 @@ import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; +import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; @@ -42,12 +43,11 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase { - private DocumentChunkingProcessor.Factory factory; + private DocumentChunkingProcessorFactory documentChunkingProcessorFactory; private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; private static final String INPUT_FIELD = "body"; - private static final String INPUT_NESTED_FIELD_KEY = "nested"; private static final String OUTPUT_FIELD = "body_chunk"; private static final String INDEX_NAME = "_index"; @@ -84,7 +84,12 @@ public void setup() { when(metadata.index(anyString())).thenReturn(null); when(clusterState.metadata()).thenReturn(metadata); when(clusterService.state()).thenReturn(clusterState); - factory = new DocumentChunkingProcessor.Factory(environment, clusterService, indicesService, getAnalysisRegistry()); + documentChunkingProcessorFactory = new DocumentChunkingProcessorFactory( + environment, + clusterService, + indicesService, + getAnalysisRegistry() + ); } private Map createFixedTokenLengthParameters() { @@ -132,7 +137,7 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance(Map registry = new HashMap<>(); - return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @SneakyThrows @@ -143,7 +148,7 @@ private DocumentChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkNum( config.put(FIELD_MAP_FIELD, fieldMap); 
config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); - return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @SneakyThrows @@ -156,40 +161,40 @@ private DocumentChunkingProcessor createDelimiterInstance() { config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); - return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } - public void testCreate_whenAlgorithmFieldMissing_failure() { + public void testCreate_whenAlgorithmFieldMissing_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); config.put(FIELD_MAP_FIELD, fieldMap); Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( OpenSearchParseException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } @SneakyThrows - public void testCreate_whenMaxChunkNumNegative() { + public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { Map registry = new HashMap<>(); Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-1)); + algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-2)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer", illegalArgumentException.getMessage()); } - public void testCreate_whenAlgorithmFieldNoAlgorithm_failure() { + public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); @@ -199,7 +204,7 @@ public void testCreate_whenAlgorithmFieldNoAlgorithm_failure() { Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", @@ -207,7 +212,7 @@ public void testCreate_whenAlgorithmFieldNoAlgorithm_failure() { ); } - public void testCreate_whenAlgorithmFieldMultipleAlgorithm_failure() { + public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); @@ -219,7 +224,7 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_failure() { Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> 
factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", @@ -227,7 +232,7 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_failure() { ); } - public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_failure() { + public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); @@ -239,13 +244,13 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_failure() { Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assert (illegalArgumentException.getMessage() .contains("Unable to create the processor as chunker algorithm [" + invalid_algorithm_type + "] is not supported")); } - public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_failure() { + public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); @@ -256,7 +261,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_failure() { Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( "Unable to create the processor as [" @@ -347,7 +352,7 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_successful() { + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_thenSucceed() { DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); @@ -362,7 +367,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_successful() { + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_thenSucceed() { DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); for (int i = 0; i < 2; i++) { IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); @@ -379,7 +384,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_Exceed() { + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumExceed_thenFail() { DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); 
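These max chunk tests pin down the limit's contract: -1 leaves it disabled, a positive value caps the per-document running chunk count, and crossing the cap raises the error asserted below. A condensed sketch of the guard, using the default value and message shown in the processor diff above:

    class ChunkLimitSketch {
        // -1 disables the limit, matching DEFAULT_MAX_CHUNK_LIMIT in the processor
        private static final int DISABLED_MAX_CHUNK_LIMIT = -1;

        static void checkChunkLimit(int currentChunkCount, int maxChunkLimit) {
            if (maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && currentChunkCount > maxChunkLimit) {
                throw new IllegalArgumentException(
                    "Unable to create the processor as the number of chunks ["
                        + currentChunkCount
                        + "] exceeds the maximum chunk limit ["
                        + maxChunkLimit
                        + "]"
                );
            }
        }
    }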
IllegalArgumentException illegalArgumentException = assertThrows( @@ -394,7 +399,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataString_successful() { + public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); @@ -409,7 +414,7 @@ public void testExecute_withFixedTokenLength_andSourceDataString_successful() { } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataInvalidType_failure() { + public void testExecute_withFixedTokenLength_andSourceDataInvalidType_thenFail() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); Map sourceAndMetadata = new HashMap<>(); sourceAndMetadata.put(INPUT_FIELD, 1); @@ -426,7 +431,7 @@ public void testExecute_withFixedTokenLength_andSourceDataInvalidType_failure() } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataListStrings_successful() { + public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSucceed() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); IngestDocument document = processor.execute(ingestDocument); @@ -445,7 +450,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_successful } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataListHybridType_failure() { + public void testExecute_withFixedTokenLength_andSourceDataListHybridType_thenFail() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListHybridType()); IllegalArgumentException illegalArgumentException = assertThrows( @@ -459,7 +464,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListHybridType_failure } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataListWithNull_failure() { + public void testExecute_withFixedTokenLength_andSourceDataListWithNull_thenFail() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListWithNull()); IllegalArgumentException illegalArgumentException = assertThrows( @@ -471,7 +476,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListWithNull_failure() @SuppressWarnings("unchecked") @SneakyThrows - public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMap()); IngestDocument document = processor.execute(ingestDocument); @@ -490,7 +495,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_successful() { } @SneakyThrows - public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_failure() { + public void 
testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_thenFail() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createMaxDepthLimitExceedMap(0)); IllegalArgumentException illegalArgumentException = assertThrows( @@ -504,7 +509,7 @@ public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_fail } @SneakyThrows - public void testExecute_withFixedTokenLength_andFieldMapNestedMap_failure() { + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataInvalidNestedMap()); IllegalArgumentException illegalArgumentException = assertThrows( @@ -519,7 +524,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_failure() { @SneakyThrows @SuppressWarnings("unchecked") - public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_successful() { + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_thenSucceed() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap()); IngestDocument document = processor.execute(ingestDocument); @@ -542,7 +547,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_suc } @SneakyThrows - public void testExecute_withDelimiter_andSourceDataString_successful() { + public void testExecute_withDelimiter_andSourceDataString_thenSucceed() { DocumentChunkingProcessor processor = createDelimiterInstance(); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index a1fa4185c..1245f2a71 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -26,7 +26,6 @@ public void testChunkerWithDelimiterFieldNotString() { public void testChunkerWithDelimiterFieldNoString() { DelimiterChunker chunker = new DelimiterChunker(); - String content = "a\nb\nc\nd"; Map inputParameters = Map.of(DELIMITER_FIELD, ""); Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage()); @@ -87,5 +86,4 @@ public void testChunkerWithStringDelimiter() { List chunkResult = chunker.chunk(content, inputParameters); assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); } - } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java new file mode 100644 index 000000000..8fb8e1421 --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java @@ -0,0 +1,114 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package 
org.opensearch.neuralsearch.processor.factory; + +import lombok.SneakyThrows; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.junit.Before; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.env.TestEnvironment; +import org.opensearch.index.analysis.AnalysisRegistry; +import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.indices.IndicesService; +import org.opensearch.indices.analysis.AnalysisModule; +import org.opensearch.ingest.Processor; +import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor; +import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; +import org.opensearch.plugins.AnalysisPlugin; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.HashMap; +import java.util.Map; + +import static java.util.Collections.singletonList; +import static java.util.Collections.singletonMap; +import static org.mockito.Mockito.mock; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; +import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; + +public class DocumentChunkingProcessorFactoryTests extends OpenSearchTestCase { + + private static final String PROCESSOR_TAG = "mockTag"; + private static final String DESCRIPTION = "mockDescription"; + private static final Map algorithmMap = Map.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, new HashMap<>()); + + private DocumentChunkingProcessorFactory documentChunkingProcessorFactory; + + @SneakyThrows + private AnalysisRegistry getAnalysisRegistry() { + Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); + Environment environment = TestEnvironment.newEnvironment(settings); + AnalysisPlugin plugin = new AnalysisPlugin() { + + @Override + public Map> getTokenizers() { + return singletonMap( + "keyword", + (indexSettings, environment, name, settings) -> TokenizerFactory.newFactory( + name, + () -> new MockTokenizer(MockTokenizer.KEYWORD, false) + ) + ); + } + }; + return new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); + } + + @Before + public void setup() { + Environment environment = mock(Environment.class); + ClusterService clusterService = mock(ClusterService.class); + IndicesService indicesService = mock(IndicesService.class); + this.documentChunkingProcessorFactory = new DocumentChunkingProcessorFactory( + environment, + clusterService, + indicesService, + getAnalysisRegistry() + ); + } + + @SneakyThrows + public void testDocumentChunkingProcessorFactory_whenAllParamsPassed_thenSuccessful() { + final Map processorFactories = new HashMap<>(); + Map config = new HashMap<>(); + config.put(ALGORITHM_FIELD, algorithmMap); + config.put(FIELD_MAP_FIELD, new HashMap<>()); + DocumentChunkingProcessor documentChunkingProcessor = documentChunkingProcessorFactory.create( + processorFactories, + PROCESSOR_TAG, + DESCRIPTION, + config + ); + assertNotNull(documentChunkingProcessor); + assertEquals(TYPE, documentChunkingProcessor.getType()); + } + + @SneakyThrows + public void testDocumentChunkingProcessorFactory_whenOnlyFieldMap_thenFail() { + final Map processorFactories = new HashMap<>(); + Map config = new HashMap<>(); + config.put(FIELD_MAP_FIELD, new HashMap<>()); + Exception exception = assertThrows( + 
Exception.class, + () -> documentChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", exception.getMessage()); + } + + @SneakyThrows + public void testDocumentChunkingProcessorFactory_whenOnlyAlgorithm_thenFail() { + final Map processorFactories = new HashMap<>(); + Map config = new HashMap<>(); + config.put(ALGORITHM_FIELD, algorithmMap); + Exception exception = assertThrows( + Exception.class, + () -> documentChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config) + ); + assertEquals("[" + FIELD_MAP_FIELD + "] required property is missing", exception.getMessage()); + } +}
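The two factory tests above pin down the creation contract: a pipeline definition must supply both the field_map and the algorithm properties, and omitting either one fails fast with a "required property is missing" error. As a rough sketch of a definition that would satisfy the factory at this point in the series (the description and field names are illustrative, and the algorithm key is still fix_length here, before its later rename to fixed_token_length):

{
  "description": "An example document chunking pipeline",
  "processors": [
    {
      "chunking": {
        "field_map": {
          "body": "body_chunk"
        },
        "algorithm": {
          "fix_length": {
            "token_limit": 10
          }
        }
      }
    }
  ]
}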
From 3158e287de51b7576c8c2a5b9b019c9133e78386 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 16:11:35 +0800 Subject: [PATCH 091/189] resolve code review comments Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 0f8b70b30..925286ea3 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -10,6 +10,7 @@ import java.util.List; import java.util.Objects; +import com.google.common.annotations.VisibleForTesting; import lombok.extern.log4j.Log4j2; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.env.Environment; @@ -25,7 +26,6 @@ import org.opensearch.neuralsearch.processor.chunker.FieldChunker; import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.DELIMITER_ALGORITHM; import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; /** @@ -41,14 +41,14 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { public static final String ALGORITHM_FIELD = "algorithm"; - public static String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + @VisibleForTesting + static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; private int currentChunkCount = 0; private int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; - private final Set supportedChunkers = ChunkerFactory.getAllChunkers(); private String chunkerType; @@ -98,15 +98,13 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { for (Map.Entry algorithmEntry : algorithmMap.entrySet()) { String algorithmKey = algorithmEntry.getKey(); Object algorithmValue = algorithmEntry.getValue(); + Set supportedChunkers = ChunkerFactory.getAllChunkers(); if (!supportedChunkers.contains(algorithmKey)) { throw new IllegalArgumentException( "Unable to create the processor as chunker algorithm [" + algorithmKey - + "] is not supported. Supported chunkers types are [" - + FIXED_LENGTH_ALGORITHM - + ", " - + DELIMITER_ALGORITHM - + "]" + + "] is not supported. Supported chunker types are " + + supportedChunkers ); } if (!(algorithmValue instanceof Map)) { @@ -115,7 +113,6 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { ); } FieldChunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); - chunker.validateParameters((Map) algorithmValue); this.chunkerType = algorithmKey; this.chunkerParameters = (Map) algorithmValue; chunker.validateParameters(chunkerParameters); From 38d6e6020b9f22fd16cc360477ede0bce936d834 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 17:04:33 +0800 Subject: [PATCH 092/189] resolve code review comments Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 16 +++++++------- .../processor/chunker/ChunkerFactory.java | 8 +------ .../processor/chunker/DelimiterChunker.java | 12 ++++++++-- .../processor/chunker/FieldChunker.java | 6 ++--- .../chunker/FixedTokenLengthChunker.java | 22 ++++++++++--------- .../chunker/ChunkerFactoryTests.java | 8 +++---- 6 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 925286ea3..3f72d95d1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -29,7 +29,8 @@ import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; /** - * This processor is used for chunking user input data text embedding processing, algorithm can be used to indicate chunking algorithm and parameters, + * This processor is used for chunking user input data, and the chunked data can be used by a downstream embedding processor; + * algorithm can be used to indicate chunking algorithm and parameters, * and field_map can be used to indicate which fields needs chunking and the corresponding keys for the chunking results.
*/ @Log4j2 @@ -176,7 +177,7 @@ private List chunkLeafType(Object value) { // leaf type is either String or List List chunkedResult = null; if (value instanceof String) { - chunkedResult = chunkString(String.valueOf(value)); + chunkedResult = chunkString(value.toString()); } else if (isListString(value)) { chunkedResult = chunkList((List) value); } @@ -217,10 +218,9 @@ private void validateFieldsValue(IngestDocument ingestDocument) { Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey()); if (sourceValue != null) { String sourceKey = embeddingFieldsEntry.getKey(); - Class sourceValueClass = sourceValue.getClass(); - if (List.class.isAssignableFrom(sourceValueClass) || Map.class.isAssignableFrom(sourceValueClass)) { + if (sourceValue instanceof List || sourceValue instanceof Map) { validateNestedTypeValue(sourceKey, sourceValue, 1); - } else if (!String.class.isAssignableFrom(sourceValueClass)) { + } else if (!(sourceValue instanceof String)) { throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it"); } } @@ -231,14 +231,14 @@ private void validateFieldsValue(IngestDocument ingestDocument) { private void validateNestedTypeValue(String sourceKey, Object sourceValue, int maxDepth) { if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) { throw new IllegalArgumentException("map type field [" + sourceKey + "] reached max depth limit, cannot process it"); - } else if ((List.class.isAssignableFrom(sourceValue.getClass()))) { + } else if (sourceValue instanceof List) { validateListTypeValue(sourceKey, sourceValue, maxDepth); - } else if (Map.class.isAssignableFrom(sourceValue.getClass())) { + } else if (sourceValue instanceof Map) { ((Map) sourceValue).values() .stream() .filter(Objects::nonNull) .forEach(x -> validateNestedTypeValue(sourceKey, x, maxDepth + 1)); - } else if (!String.class.isAssignableFrom(sourceValue.getClass())) { + } else if (!(sourceValue instanceof String)) { throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it"); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 6dea95a22..e3e846308 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -24,13 +24,7 @@ public static FieldChunker create(String type, AnalysisRegistry analysisRegistry return new DelimiterChunker(); default: throw new IllegalArgumentException( - "chunker type [" - + type - + "] is not supported. Supported chunkers types are [" - + FIXED_LENGTH_ALGORITHM - + ", " - + DELIMITER_ALGORITHM - + "]" + "chunker type [" + type + "] is not supported. 
Supported chunker types are " + ChunkerFactory.getAllChunkers() ); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 8fa8ec088..e46cd0810 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -4,6 +4,8 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import org.apache.commons.lang3.StringUtils; + import java.util.Map; import java.util.List; import java.util.ArrayList; @@ -17,6 +19,8 @@ public DelimiterChunker() {} public static String DELIMITER_FIELD = "delimiter"; + public static String DEFAULT_DELIMITER = "."; + /** * Validate the chunked passages for delimiter algorithm * @@ -30,7 +34,7 @@ public void validateParameters(Map parameters) { Object delimiter = parameters.get(DELIMITER_FIELD); if (!(delimiter instanceof String)) { throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string."); - } else if (((String) delimiter).isEmpty()) { + } else if (StringUtils.isBlank(delimiter.toString())) { throw new IllegalArgumentException("delimiter parameters should not be empty."); } } @@ -44,7 +48,11 @@ public void validateParameters(Map parameters) { */ @Override public List chunk(String content, Map parameters) { - String delimiter = (String) parameters.getOrDefault(DELIMITER_FIELD, "."); + String delimiter = DEFAULT_DELIMITER; + if (parameters.containsKey(DELIMITER_FIELD)) { + Object delimiterObject = parameters.get(DELIMITER_FIELD); + delimiter = delimiterObject.toString(); + } List chunkResult = new ArrayList<>(); int start = 0; int end = content.indexOf(delimiter); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java index 84aeaeb03..20546245a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java @@ -14,15 +14,15 @@ public interface FieldChunker { /** - * Chunk the incoming string and return chunked passages + * Validate the parameters for the chunking algorithm, + * and throw IllegalArgumentException when parameters are invalid * * @param parameters a map containing parameters for chunking algorithms - * @throws IllegalArgumentException parameters are invalid */ void validateParameters(Map parameters); /** - * Chunk the incoming string and return chunked passages + * Chunk the incoming string according to parameters and return chunked passages * * @param content input string * @param parameters a map containing parameters for chunking algorithms diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index b58a9c157..ff79cfc35 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -11,8 +11,10 @@ import java.util.List; import java.util.ArrayList; import java.util.stream.Collectors; + import lombok.extern.log4j.Log4j2; +import org.apache.commons.lang3.math.NumberUtils; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import
org.opensearch.index.analysis.AnalysisRegistry; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; @@ -48,12 +50,13 @@ private void validatePositiveIntegerParameter(Map parameters, St // all parameters are optional return; } - if (!(parameters.get(fieldName) instanceof Number)) { + String fieldValue = parameters.get(fieldName).toString(); + if (!(NumberUtils.isParsable(fieldValue))) { throw new IllegalArgumentException( "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (((Number) parameters.get(fieldName)).intValue() <= 0) { + if (NumberUtils.createInteger(fieldValue) <= 0) { throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); } } @@ -88,13 +91,14 @@ public void validateParameters(Map parameters) { validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD); if (parameters.containsKey(OVERLAP_RATE_FIELD)) { - if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { + String overlapRateString = parameters.get(OVERLAP_RATE_FIELD).toString(); + if (!(NumberUtils.isParsable(overlapRateString))) { throw new IllegalArgumentException( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - BigDecimal overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); - if (overlap_rate.compareTo(BigDecimal.ZERO) < 0 || overlap_rate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { + BigDecimal overlapRate = new BigDecimal(overlapRateString); + if (overlapRate.compareTo(BigDecimal.ZERO) < 0 || overlapRate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { throw new IllegalArgumentException( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND ); @@ -122,7 +126,7 @@ public void validateParameters(Map parameters) { public List chunk(String content, Map parameters) { // prior to chunking, parameters have been validated int tokenLimit = DEFAULT_TOKEN_LIMIT; - BigDecimal overlap_rate = DEFAULT_OVERLAP_RATE; + BigDecimal overlapRate = DEFAULT_OVERLAP_RATE; int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; String tokenizer = DEFAULT_TOKENIZER; @@ -131,7 +135,7 @@ public List chunk(String content, Map parameters) { tokenLimit = ((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue(); } if (parameters.containsKey(OVERLAP_RATE_FIELD)) { - overlap_rate = new BigDecimal(String.valueOf(parameters.get(OVERLAP_RATE_FIELD))); + overlapRate = new BigDecimal(parameters.get(OVERLAP_RATE_FIELD).toString()); } if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) { maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT_FIELD)).intValue(); @@ -145,11 +149,9 @@ public List chunk(String content, Map parameters) { String passage; int startToken = 0; - BigDecimal overlapTokenNumberBigDecimal = overlap_rate.multiply(new BigDecimal(String.valueOf(tokenLimit))) + BigDecimal overlapTokenNumberBigDecimal = overlapRate.multiply(new BigDecimal(String.valueOf(tokenLimit))) .setScale(0, RoundingMode.DOWN); int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue(); - // overlapTokenNumber must be smaller than the token limit - overlapTokenNumber = Math.min(overlapTokenNumber, tokenLimit - 1); while (startToken < tokens.size()) { if (startToken + tokenLimit >= tokens.size()) { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 
38557c1d4..3bf347a56 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -33,13 +33,11 @@ public void testCreate_Delimiter() { } public void testCreate_Invalid() { + String invalidChunkerType = "Invalid Chunker Type"; IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> ChunkerFactory.create("Invalid Chunker Type", analysisRegistry) - ); - assertEquals( - "chunker type [Invalid Chunker Type] is not supported. Supported chunkers types are [fix_length, delimiter]", - illegalArgumentException.getMessage() + () -> ChunkerFactory.create(invalidChunkerType, analysisRegistry) ); + assert (illegalArgumentException.getMessage().contains("chunker type [" + invalidChunkerType + "] is not supported.")); } } From 584bc59e2aa8622e1969c594e872aa768f365148 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 17:08:01 +0800 Subject: [PATCH 093/189] resolve code review comments Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 8 +++++--- .../neuralsearch/processor/chunker/DelimiterChunker.java | 4 ++-- .../processor/chunker/FixedTokenLengthChunker.java | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 3f72d95d1..6c0eaba56 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -12,6 +12,8 @@ import com.google.common.annotations.VisibleForTesting; import lombok.extern.log4j.Log4j2; +import org.apache.commons.lang3.math.NumberUtils; + import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.env.Environment; import org.opensearch.index.IndexService; @@ -118,13 +120,13 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { this.chunkerParameters = (Map) algorithmValue; chunker.validateParameters(chunkerParameters); if (((Map) algorithmValue).containsKey(MAX_CHUNK_LIMIT_FIELD)) { - Object maxChunkLimitObject = ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD); - if (!(maxChunkLimitObject instanceof Number)) { + String maxChunkLimitString = ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD).toString(); + if (!(NumberUtils.isParsable(maxChunkLimitString))) { throw new IllegalArgumentException( "Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - int maxChunkLimit = ((Number) maxChunkLimitObject).intValue(); + int maxChunkLimit = NumberUtils.createInteger(maxChunkLimitString); if (maxChunkLimit <= 0 && maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT) { throw new IllegalArgumentException("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer"); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index e46cd0810..5ba139489 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -4,12 +4,12 @@ */ package org.opensearch.neuralsearch.processor.chunker; -import org.apache.commons.lang3.StringUtils; - import java.util.Map; import 
java.util.List; import java.util.ArrayList; +import org.apache.commons.lang3.StringUtils; + /** * The implementation {@link FieldChunker} for delimiter algorithm */ diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index ff79cfc35..3c2a89f52 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -13,8 +13,8 @@ import java.util.stream.Collectors; import lombok.extern.log4j.Log4j2; - import org.apache.commons.lang3.math.NumberUtils; + import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.index.analysis.AnalysisRegistry; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; From cbea5dfad77c71ddb23ad405c647bec181e71c4a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 21:53:19 +0800 Subject: [PATCH 094/189] implement chunk count wrapper for max chunk limit Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 6c0eaba56..0e7e73a50 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -49,8 +49,6 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; - private int currentChunkCount = 0; - private int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; private String chunkerType; @@ -67,6 +65,19 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private final Environment environment; + /** + * Users may specify the parameter max_chunk_limit to restrict the number of strings produced by chunking. + * Here the chunkCountWrapper stores and accumulates the number of chunks across all output fields. + * chunkCount: the current number of chunks in the chunking results.
+ */ + static class ChunkCountWrapper { + private int chunkCount; + + protected ChunkCountWrapper(int chunkCount) { + this.chunkCount = chunkCount; + } + } + public DocumentChunkingProcessor( String tag, String description, @@ -149,14 +160,14 @@ private boolean isListString(Object value) { return true; } - private List chunkString(String content) { + private List chunkString(String content, ChunkCountWrapper chunkCountWrapper) { FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); List result = chunker.chunk(content, chunkerParameters); - currentChunkCount += result.size(); - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && currentChunkCount > maxChunkLimit) { + chunkCountWrapper.chunkCount += result.size(); + if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCountWrapper.chunkCount > maxChunkLimit) { throw new IllegalArgumentException( "Unable to create the processor as the number of chunks [" - + currentChunkCount + + chunkCountWrapper.chunkCount + "] exceeds the maximum chunk limit [" + maxChunkLimit + "]" @@ -165,23 +176,23 @@ private List chunkString(String content) { return result; } - private List chunkList(List contentList) { + private List chunkList(List contentList, ChunkCountWrapper chunkCountWrapper) { // flatten the List> output to List List result = new ArrayList<>(); for (String content : contentList) { - result.addAll(chunkString(content)); + result.addAll(chunkString(content, chunkCountWrapper)); } return result; } @SuppressWarnings("unchecked") - private List chunkLeafType(Object value) { + private List chunkLeafType(Object value, ChunkCountWrapper chunkCountWrapper) { // leaf type is either String or List List chunkedResult = null; if (value instanceof String) { - chunkedResult = chunkString(value.toString()); + chunkedResult = chunkString(value.toString(), chunkCountWrapper); } else if (isListString(value)) { - chunkedResult = chunkList((List) value); + chunkedResult = chunkList((List) value, chunkCountWrapper); } return chunkedResult; } @@ -193,7 +204,7 @@ private List chunkLeafType(Object value) { @Override public IngestDocument execute(IngestDocument ingestDocument) { validateFieldsValue(ingestDocument); - currentChunkCount = 0; + ChunkCountWrapper chunkCountWrapper = new ChunkCountWrapper(0); if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); @@ -209,7 +220,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { } Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); - chunkMapType(sourceAndMetadataMap, fieldMap); + chunkMapType(sourceAndMetadataMap, fieldMap, chunkCountWrapper); sourceAndMetadataMap.forEach(ingestDocument::setFieldValue); return ingestDocument; } @@ -259,7 +270,7 @@ private void validateListTypeValue(String sourceKey, Object sourceValue, int max } @SuppressWarnings("unchecked") - private void chunkMapType(Map sourceAndMetadataMap, Map fieldMap) { + private void chunkMapType(Map sourceAndMetadataMap, Map fieldMap, ChunkCountWrapper chunkCountWrapper) { for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { String originalKey = fieldMapEntry.getKey(); Object targetKey = fieldMapEntry.getValue(); @@ -270,16 +281,16 @@ private void chunkMapType(Map sourceAndMetadataMap, Map sourceObjectList = (List) sourceObject; for (Object source : sourceObjectList) { if (source instanceof Map) { - chunkMapType((Map) source, (Map) targetKey); + chunkMapType((Map) source, (Map) 
targetKey, chunkCountWrapper); } } } else if (sourceObject instanceof Map) { - chunkMapType((Map) sourceObject, (Map) targetKey); + chunkMapType((Map) sourceObject, (Map) targetKey, chunkCountWrapper); } } else { // chunk the object when target key is a string Object chunkObject = sourceAndMetadataMap.get(originalKey); - List chunkedResult = chunkLeafType(chunkObject); + List chunkedResult = chunkLeafType(chunkObject, chunkCountWrapper); if (chunkedResult != null) { sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); } From c3c8ff2edf3d571bf2566493640c3b07bb60c4c5 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 22:05:02 +0800 Subject: [PATCH 095/189] rename variable end to nextDelimiterPosition Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 5ba139489..c81eec999 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -53,19 +53,22 @@ public List chunk(String content, Map parameters) { Object delimiterObject = parameters.get(DELIMITER_FIELD); delimiter = delimiterObject.toString(); } + List chunkResult = new ArrayList<>(); - int start = 0; - int end = content.indexOf(delimiter); + int start = 0, end; + int nextDelimiterPosition = content.indexOf(delimiter); - while (end != -1) { - chunkResult.add(content.substring(start, end + delimiter.length())); - start = end + delimiter.length(); - end = content.indexOf(delimiter, start); + while (nextDelimiterPosition != -1) { + end = nextDelimiterPosition + delimiter.length(); + chunkResult.add(content.substring(start, end)); + start = end; + nextDelimiterPosition = content.indexOf(delimiter, start); } if (start < content.length()) { chunkResult.add(content.substring(start)); } + return chunkResult; } } From da055e7e6da9381ffae005f1c304ba1f96bde6fe Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 22:05:36 +0800 Subject: [PATCH 096/189] adjust method place Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 3c2a89f52..f43ac6170 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -44,35 +44,6 @@ public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { this.analysisRegistry = analysisRegistry; } - private void validatePositiveIntegerParameter(Map parameters, String fieldName) { - // this method validate that parameter is a positive integer - if (!parameters.containsKey(fieldName)) { - // all parameters are optional - return; - } - String fieldValue = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValue))) { - throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - if (NumberUtils.createInteger(fieldValue) <= 0) { - throw new IllegalArgumentException("fixed 
length parameter [" + fieldName + "] must be positive"); - } - } - - private List tokenize(String content, String tokenizer, int maxTokenCount) { - AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); - analyzeRequest.text(content); - analyzeRequest.tokenizer(tokenizer); - try { - AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); - return analyzeResponse.getTokens().stream().map(AnalyzeAction.AnalyzeToken::getTerm).collect(Collectors.toList()); - } catch (IOException e) { - throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); - } - } - /** * Validate the chunked passages for fixed token length algorithm * @@ -112,6 +83,23 @@ public void validateParameters(Map parameters) { } } + private void validatePositiveIntegerParameter(Map parameters, String fieldName) { + // this method validate that parameter is a positive integer + if (!parameters.containsKey(fieldName)) { + // all parameters are optional + return; + } + String fieldValue = parameters.get(fieldName).toString(); + if (!(NumberUtils.isParsable(fieldValue))) { + throw new IllegalArgumentException( + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + if (NumberUtils.createInteger(fieldValue) <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + } + } + /** * Return the chunked passages for fixed token length algorithm * @@ -167,4 +155,16 @@ public List chunk(String content, Map parameters) { } return passages; } + + private List tokenize(String content, String tokenizer, int maxTokenCount) { + AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); + analyzeRequest.text(content); + analyzeRequest.tokenizer(tokenizer); + try { + AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); + return analyzeResponse.getTokens().stream().map(AnalyzeAction.AnalyzeToken::getTerm).collect(Collectors.toList()); + } catch (IOException e) { + throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); + } + } } From d32840c2f5cbccad63ab2ab25a70ea0c3da513b0 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 22:09:25 +0800 Subject: [PATCH 097/189] update java doc for fixed token length algorithm Signed-off-by: yuye-aws --- .../processor/chunker/FixedTokenLengthChunker.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index f43ac6170..7bb082103 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -45,16 +45,18 @@ public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { } /** - * Validate the chunked passages for fixed token length algorithm + * Validate the chunked passages for fixed token length algorithm, + * will throw IllegalArgumentException when parameters are invalid * - * @param parameters a map containing parameters, containing the following parameters + * @param parameters a map containing parameters, containing the following parameters: * 1. tokenizer the analyzer tokenizer in opensearch, please check https://opensearch.org/docs/latest/analyzers/tokenizers/index/ * 2. 
token_limit the token limit for each chunked passage * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage * 4. max_token_count the max token limit for the tokenizer - * @throws IllegalArgumentException If max_token_count and token_limit is not a positive integer - * @throws IllegalArgumentException If overlap_rate is not within range [0, 0.5] - * @throws IllegalArgumentException If tokenizer is not a string + * Here are requirements for parameters: + * max_token_count and token_limit should be a positive integer + * overlap_rate should be within range [0, 0.5] + * tokenizer should be string */ @Override public void validateParameters(Map parameters) { From 830f665e2239591bfbde638135e1f2f4d3afa15a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 22:14:53 +0800 Subject: [PATCH 098/189] reanme interface name and fixed token length algorithm name Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 10 +++++----- .../chunker/{FieldChunker.java => Chunker.java} | 2 +- .../processor/chunker/ChunkerFactory.java | 8 ++++---- .../processor/chunker/DelimiterChunker.java | 4 ++-- .../chunker/FixedTokenLengthChunker.java | 6 +++--- .../processor/DocumentChunkingProcessorTests.java | 12 ++++++------ .../processor/chunker/ChunkerFactoryTests.java | 6 +++--- .../DocumentChunkingProcessorFactoryTests.java | 15 +++++++-------- 8 files changed, 31 insertions(+), 32 deletions(-) rename src/main/java/org/opensearch/neuralsearch/processor/chunker/{FieldChunker.java => Chunker.java} (96%) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 0e7e73a50..16666ee26 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -25,10 +25,10 @@ import org.opensearch.ingest.AbstractProcessor; import org.opensearch.ingest.IngestDocument; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; -import org.opensearch.neuralsearch.processor.chunker.FieldChunker; +import org.opensearch.neuralsearch.processor.chunker.Chunker; import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM; /** * This processor is used for chunking user input data and chunked data could be used for downstream embedding processor, @@ -126,7 +126,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { "Unable to create the processor as [" + algorithmKey + "] parameters cannot be cast to [" + Map.class.getName() + "]" ); } - FieldChunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); + Chunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); this.chunkerType = algorithmKey; this.chunkerParameters = (Map) algorithmValue; chunker.validateParameters(chunkerParameters); @@ -161,7 +161,7 @@ private boolean isListString(Object value) { } private List chunkString(String content, ChunkCountWrapper chunkCountWrapper) { - FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); + Chunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); 
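To make the overlap arithmetic concrete: in the chunk method, the number of overlapping tokens is overlap_rate multiplied by token_limit and rounded down (the RoundingMode.DOWN computation shown earlier in the series), so token_limit 10 with overlap_rate 0.2 yields 2 overlap tokens, meaning each passage after the first re-includes roughly the last 2 tokens of the previous passage and the window advances 8 tokens at a time. A minimal sketch of the algorithm parameters exercising this (values illustrative; the algorithm key is still fix_length here and is renamed to fixed_token_length in the next patch):

{
  "fix_length": {
    "tokenizer": "standard",
    "token_limit": 10,
    "overlap_rate": 0.2,
    "max_token_count": 100
  }
}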
From 830f665e2239591bfbde638135e1f2f4d3afa15a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 22:14:53 +0800 Subject: [PATCH 098/189] rename interface name and fixed token length algorithm name Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 10 +++++----- .../chunker/{FieldChunker.java => Chunker.java} | 2 +- .../processor/chunker/ChunkerFactory.java | 8 ++++---- .../processor/chunker/DelimiterChunker.java | 4 ++-- .../chunker/FixedTokenLengthChunker.java | 6 +++--- .../processor/DocumentChunkingProcessorTests.java | 12 ++++++------ .../processor/chunker/ChunkerFactoryTests.java | 6 +++--- .../DocumentChunkingProcessorFactoryTests.java | 15 +++++++-------- 8 files changed, 31 insertions(+), 32 deletions(-) rename src/main/java/org/opensearch/neuralsearch/processor/chunker/{FieldChunker.java => Chunker.java} (96%) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 0e7e73a50..16666ee26 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -25,10 +25,10 @@ import org.opensearch.ingest.AbstractProcessor; import org.opensearch.ingest.IngestDocument; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; -import org.opensearch.neuralsearch.processor.chunker.FieldChunker; +import org.opensearch.neuralsearch.processor.chunker.Chunker; import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_LENGTH_ALGORITHM; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM; /** @@ -126,7 +126,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { "Unable to create the processor as [" + algorithmKey + "] parameters cannot be cast to [" + Map.class.getName() + "]" ); } - FieldChunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); + Chunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); this.chunkerType = algorithmKey; this.chunkerParameters = (Map) algorithmValue; chunker.validateParameters(chunkerParameters); @@ -161,7 +161,7 @@ private boolean isListString(Object value) { } private List chunkString(String content, ChunkCountWrapper chunkCountWrapper) { - FieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); + Chunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); List result = chunker.chunk(content, chunkerParameters); chunkCountWrapper.chunkCount += result.size(); if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCountWrapper.chunkCount > maxChunkLimit) { throw new IllegalArgumentException( "Unable to create the processor as the number of chunks [" @@ -205,7 +205,7 @@ private List chunkLeafType(Object value, ChunkCountWrapper chunkCountWra public IngestDocument execute(IngestDocument ingestDocument) { validateFieldsValue(ingestDocument); ChunkCountWrapper chunkCountWrapper = new ChunkCountWrapper(0); - if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) { + if (Objects.equals(chunkerType, FIXED_TOKEN_LENGTH_ALGORITHM)) { // add maxTokenCount setting from index metadata to chunker parameters Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java similarity index 96% rename from src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java rename to src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 20546245a..0e4d3bc5d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FieldChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -11,7 +11,7 @@ * The interface for all chunking algorithms. * All algorithms need to validate parameters and chunk the content, */ -public interface FieldChunker { +public interface Chunker { /** * Validate the parameters for the chunking algorithm, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index e3e846308..52225ac34 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -13,12 +13,12 @@ */ public class ChunkerFactory { - public static final String FIXED_LENGTH_ALGORITHM = "fix_length"; + public static final String FIXED_TOKEN_LENGTH_ALGORITHM = "fixed_token_length"; public static final String DELIMITER_ALGORITHM = "delimiter"; - public static FieldChunker create(String type, AnalysisRegistry analysisRegistry) { + public static Chunker create(String type, AnalysisRegistry analysisRegistry) { switch (type) { - case FIXED_LENGTH_ALGORITHM: + case FIXED_TOKEN_LENGTH_ALGORITHM: return new FixedTokenLengthChunker(analysisRegistry); case DELIMITER_ALGORITHM: return new DelimiterChunker(); @@ -30,6 +30,6 @@ public static FieldChunker create(String type, AnalysisRegistry analysisRegistry) { } public static Set getAllChunkers() { - return Set.of(FIXED_LENGTH_ALGORITHM, DELIMITER_ALGORITHM); + return Set.of(FIXED_TOKEN_LENGTH_ALGORITHM, DELIMITER_ALGORITHM); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index c81eec999..9b3441500 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -11,9 +11,9 @@ import org.apache.commons.lang3.StringUtils; /** - * The implementation {@link FieldChunker} for delimiter algorithm + * The implementation {@link Chunker} for delimiter algorithm */ -public class DelimiterChunker implements FieldChunker { +public class
DelimiterChunker implements Chunker { public DelimiterChunker() {} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 7bb082103..8a40b29cd 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -20,10 +20,10 @@ import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; /** - * The implementation {@link FieldChunker} for fixed token length algorithm. + * The implementation {@link Chunker} for fixed token length algorithm. */ @Log4j2 -public class FixedTokenLengthChunker implements FieldChunker { +public class FixedTokenLengthChunker implements Chunker { public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; @@ -94,7 +94,7 @@ private void validatePositiveIntegerParameter(Map parameters, St String fieldValue = parameters.get(fieldName).toString(); if (!(NumberUtils.isParsable(fieldValue))) { throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } if (NumberUtils.createInteger(fieldValue) <= 0) { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index b444e19b7..bd0a2a32d 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -133,7 +133,7 @@ private Map createNestedFieldMap() { private DocumentChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); + algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); @@ -144,7 +144,7 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance(Map fieldMap, int maxChunkNum) { Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); + algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); @@ -183,7 +183,7 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-2)); + algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-2)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); IllegalArgumentException illegalArgumentException = assertThrows( @@ -218,7 +218,7 @@ public void 
testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); + algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); @@ -256,7 +256,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - algorithmMap.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, 1); + algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, 1); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( @@ -265,7 +265,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { ); assertEquals( "Unable to create the processor as [" - + ChunkerFactory.FIXED_LENGTH_ALGORITHM + + ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM + "] parameters cannot be cast to [" + Map.class.getName() + "]", diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 3bf347a56..8978946a7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -16,18 +16,18 @@ public class ChunkerFactoryTests extends OpenSearchTestCase { private AnalysisRegistry analysisRegistry; public void testGetAllChunkers() { - Set expected = Set.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, ChunkerFactory.DELIMITER_ALGORITHM); + Set expected = Set.of(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, ChunkerFactory.DELIMITER_ALGORITHM); assertEquals(expected, ChunkerFactory.getAllChunkers()); } public void testCreate_FixedTokenLength() { - FieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, analysisRegistry); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, analysisRegistry); assertNotNull(chunker); assertTrue(chunker instanceof FixedTokenLengthChunker); } public void testCreate_Delimiter() { - FieldChunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry); assertNotNull(chunker); assertTrue(chunker instanceof DelimiterChunker); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java index 8fb8e1421..1a5635791 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java @@ -7,6 +7,12 @@ import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; +import java.util.HashMap; +import java.util.Map; +import static java.util.Collections.singletonList; 
+import static java.util.Collections.singletonMap; +import static org.mockito.Mockito.mock; + import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; import org.opensearch.env.Environment; @@ -20,13 +26,6 @@ import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; - -import java.util.HashMap; -import java.util.Map; - -import static java.util.Collections.singletonList; -import static java.util.Collections.singletonMap; -import static org.mockito.Mockito.mock; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; @@ -35,7 +34,7 @@ public class DocumentChunkingProcessorFactoryTests extends OpenSearchTestCase { private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; - private static final Map algorithmMap = Map.of(ChunkerFactory.FIXED_LENGTH_ALGORITHM, new HashMap<>()); + private static final Map algorithmMap = Map.of(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, new HashMap<>()); private DocumentChunkingProcessorFactory documentChunkingProcessorFactory; From 1275bd641a2554e3e4ad4f085ae33e0ea9ecb9db Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 22:15:34 +0800 Subject: [PATCH 099/189] update fixed token length algorithm configuration for integration tests Signed-off-by: yuye-aws --- .../resources/processor/chunker/PipelineForCascadedChunker.json | 2 +- .../processor/chunker/PipelineForFixedTokenLengthChunker.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/resources/processor/chunker/PipelineForCascadedChunker.json b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json index 3125d3d53..e6add1f9f 100644 --- a/src/test/resources/processor/chunker/PipelineForCascadedChunker.json +++ b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json @@ -19,7 +19,7 @@ "body_chunk_intermediate": "body_chunk" }, "algorithm": { - "fix_length": { + "fixed_token_length": { "token_limit": 10 } } diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json index c2a55e4f2..29814a3b4 100644 --- a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json @@ -7,7 +7,7 @@ "body": "body_chunk" }, "algorithm": { - "fix_length": { + "fixed_token_length": { "token_limit": 10 } } From 4e2f5d47a9092406213e282815932b336253bbea Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 22:18:57 +0800 Subject: [PATCH 100/189] make delimiter member variables static Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/DelimiterChunker.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 9b3441500..76b258504 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -17,9 +17,9 @@ public class 
DelimiterChunker implements Chunker { public DelimiterChunker() {} - public static String DELIMITER_FIELD = "delimiter"; + public static final String DELIMITER_FIELD = "delimiter"; - public static String DEFAULT_DELIMITER = "."; + public static final String DEFAULT_DELIMITER = "."; /** * Validate the chunked passages for delimiter algorithm From 5c20b9bd968ffe5237243bd8c540f5243993e12a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 23:17:40 +0800 Subject: [PATCH 101/189] remove redundant set field value in execute method Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 16666ee26..2dcd8acb1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -221,7 +221,6 @@ public IngestDocument execute(IngestDocument ingestDocument) { Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); chunkMapType(sourceAndMetadataMap, fieldMap, chunkCountWrapper); - sourceAndMetadataMap.forEach(ingestDocument::setFieldValue); return ingestDocument; } From addd37eed23e7676d2aeff1357bee1c3f62b8abd Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 8 Mar 2024 23:36:06 +0800 Subject: [PATCH 102/189] resolve code review comments Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 2dcd8acb1..63d04a505 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -130,8 +130,8 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { this.chunkerType = algorithmKey; this.chunkerParameters = (Map) algorithmValue; chunker.validateParameters(chunkerParameters); - if (((Map) algorithmValue).containsKey(MAX_CHUNK_LIMIT_FIELD)) { - String maxChunkLimitString = ((Map) algorithmValue).get(MAX_CHUNK_LIMIT_FIELD).toString(); + if (chunkerParameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { + String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString(); if (!(NumberUtils.isParsable(maxChunkLimitString))) { throw new IllegalArgumentException( "Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } @@ -147,7 +147,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { } @SuppressWarnings("unchecked") - private boolean isListString(Object value) { + private boolean isListOfString(Object value) { // an empty list is also List if (!(value instanceof List)) { return false; }
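Between these revisions, the pieces fit together as follows: max_chunk_limit is validated inside the algorithm parameters as a positive integer (or the default -1, meaning no limit), and the fixed token length chunker accepts a tokenizer name. A rough sketch of a processor configuration combining these, ahead of the tokenizer-focused integration tests that follow (names and values are illustrative, not taken from the test resources):

{
  "chunking": {
    "field_map": {
      "body": "body_chunk"
    },
    "algorithm": {
      "fixed_token_length": {
        "token_limit": 10,
        "tokenizer": "lowercase",
        "max_chunk_limit": 100
      }
    }
  }
}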
From 74691533c1bb9c1efe1bf6cfcc938db6e3f16499 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Sun, 10 Mar 2024 21:54:23 +0800 Subject: [PATCH 103/189] add integration tests with more tokenizers Signed-off-by: yuye-aws --- .../DocumentChunkingProcessorIT.java | 91 ++++++++++++++----- .../DocumentChunkingProcessorTests.java | 1 + ...okenLengthChunkerWithLetterTokenizer.json} | 5 +- ...enLengthChunkerWithLowerCaseTokenizer.json | 18 ++++ ...kenLengthChunkerWithStandardTokenizer.json | 18 ++++ 5 files changed, 110 insertions(+), 23 deletions(-) rename src/test/resources/processor/chunker/{PipelineForFixedTokenLengthChunker.json => PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json} (75%) create mode 100644 src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowerCaseTokenizer.json create mode 100644 src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index c47f8e225..2f3452c16 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -9,6 +9,8 @@ import org.apache.hc.core5.http.io.entity.EntityUtils; import org.apache.hc.core5.http.message.BasicHeader; import org.junit.Before; + +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -29,7 +31,14 @@ public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT { private static final String INTERMEDIATE_FIELD = "body_chunk_intermediate"; - private static final String FIXED_TOKEN_LENGTH_PIPELINE_NAME = "pipeline-document-chunking-fixed-token-length"; + private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME = + "pipeline-document-chunking-fixed-token-length-standard-tokenizer"; + + private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME = + "pipeline-document-chunking-fixed-token-length-letter-tokenizer"; + + private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME = + "pipeline-document-chunking-fixed-token-length-lowercase-tokenizer"; private static final String DELIMITER_PIPELINE_NAME = "pipeline-document-chunking-delimiter"; @@ -40,8 +49,12 @@ public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT { private static final String TEST_LONG_DOCUMENT = "processor/chunker/DocumentChunkingTestLongDocument.json"; private static final Map PIPELINE_CONFIGS_BY_NAME = Map.of( - FIXED_TOKEN_LENGTH_PIPELINE_NAME, - "processor/chunker/PipelineForFixedTokenLengthChunker.json", + FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME, + "processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json", + FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME, + "processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json", + FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, + "processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json", DELIMITER_PIPELINE_NAME, "processor/chunker/PipelineForDelimiterChunker.json", CASCADE_PIPELINE_NAME, @@ -54,10 +67,10 @@ public void setUp() throws Exception { updateClusterSettings(); } - public void testDocumentChunkingProcessor_withFixedTokenLength_successful() throws Exception { + public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception { try { -
createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME); - createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME); + createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); + createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -66,25 +79,59 @@ public void testDocumentChunkingProcessor_withFixedTokenLength_successful() thro expectedPassages.add("standard tokenizer in OpenSearch"); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { - wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null); + wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME, null, null); } } - public void testDocumentChunkingProcessor_withFixedTokenLength_fail() throws Exception { + public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception { try { - createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME); - createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME); + createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); + createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); + ingestDocument(TEST_DOCUMENT); + + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and tokens by standard"); + expectedPassages.add("tokenizer in OpenSearch"); + validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); + } finally { + wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME, null, null); + } + } + + public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception { + try { + // lowercase tokenizer also excludes numbers + createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); + createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); + ingestDocument(TEST_DOCUMENT); + + List expectedPassages = new ArrayList<>(); + expectedPassages.add("this is an example document to be chunked the document"); + expectedPassages.add("contains a single paragraph two sentences and tokens by standard"); + expectedPassages.add("tokenizer in opensearch"); + validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); + } finally { + wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null); + } + } + + public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail() + throws Exception { + try { + createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); + createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT)); // max_token_count is 100 by index settings assert (exception.getMessage() .contains("The number of tokens produced by calling _analyze has exceeded the allowed maximum of [100].")); assertEquals(0, getDocCount(INDEX_NAME)); } finally { - wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null); + wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME, null, null); } } - public 
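The tokenizer choice explains the differing expectations in the three tests above: the standard tokenizer emits "24" as a token, the letter tokenizer splits on every non-letter character (so "24" never becomes a token), and the lowercase tokenizer behaves like the letter tokenizer plus lowercasing, which is what the in-test comment about excluding numbers is getting at. A sketch against the Lucene components these tokenizers are built on (assumes lucene-analysis-common on the classpath):

    import java.io.StringReader;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.LetterTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenizerSketch {
        public static void main(String[] args) throws Exception {
            LetterTokenizer letter = new LetterTokenizer();
            letter.setReader(new StringReader("two sentences and 24 tokens"));
            TokenStream stream = new LowerCaseFilter(letter); // "lowercase" = letter tokenizer + lowercasing
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // two, sentences, and, tokens -- "24" is dropped
            }
            stream.end();
            stream.close();
        }
    }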
void testDocumentChunkingProcessor_withDelimiter_successful() throws Exception { + public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception { try { createPipelineProcessor(DELIMITER_PIPELINE_NAME); createDocumentChunkingIndex(DELIMITER_PIPELINE_NAME); @@ -101,7 +148,7 @@ public void testDocumentChunkingProcessor_withDelimiter_successful() throws Exce } } - public void testDocumentChunkingProcessor_withCascade_successful() throws Exception { + public void testDocumentChunkingProcessor_withCascadePipeline_successful() throws Exception { try { createPipelineProcessor(CASCADE_PIPELINE_NAME); createDocumentChunkingIndex(CASCADE_PIPELINE_NAME); @@ -142,8 +189,10 @@ private void validateIndexIngestResults(String indexName, String fieldName, Obje assertEquals(expected, ingestOutputs); } - private void createPipelineProcessor(final String pipelineName) throws Exception { - String requestBody = Files.readString(Path.of(classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName)).toURI())); + private void createPipelineProcessor(String pipelineName) throws Exception { + URL pipelineURLPath = classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName)); + assert pipelineURLPath != null; + String requestBody = Files.readString(Path.of(pipelineURLPath.toURI())); Response pipelineCreateResponse = makeRequest( client(), "PUT", @@ -161,15 +210,15 @@ private void createPipelineProcessor(final String pipelineName) throws Exception } private void createDocumentChunkingIndex(String pipelineName) throws Exception { - createIndexWithConfiguration( - INDEX_NAME, - Files.readString(Path.of(classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json").toURI())), - pipelineName - ); + URL indexSettingsURLPath = classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json"); + assert indexSettingsURLPath != null; + createIndexWithConfiguration(INDEX_NAME, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName); } private void ingestDocument(String documentPath) throws Exception { - String ingestDocument = Files.readString(Path.of(classLoader.getResource(documentPath).toURI())); + URL documentURLPath = classLoader.getResource(documentPath); + assert documentURLPath != null; + String ingestDocument = Files.readString(Path.of(documentURLPath.toURI())); Response response = makeRequest( client(), "POST", diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index bd0a2a32d..17d230e43 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -95,6 +95,7 @@ public void setup() { private Map createFixedTokenLengthParameters() { Map parameters = new HashMap<>(); parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); + parameters.put(FixedTokenLengthChunker.TOKENIZER_FIELD, "letter"); return parameters; } diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json similarity index 75% rename from src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json rename to src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json index 29814a3b4..e94dc1c05 100644 --- 
a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunker.json +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json @@ -1,5 +1,5 @@ { - "description": "An example fixed token length chunker pipeline", + "description": "An example fixed token length chunker pipeline with letter tokenizer", "processors" : [ { "chunking": { @@ -8,7 +8,8 @@ }, "algorithm": { "fixed_token_length": { - "token_limit": 10 + "token_limit": 10, + "tokenizer": "letter" } } } diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowerCaseTokenizer.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowerCaseTokenizer.json new file mode 100644 index 000000000..2f2ccb664 --- /dev/null +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowerCaseTokenizer.json @@ -0,0 +1,18 @@ +{ + "description": "An example fixed token length chunker pipeline with lowercase tokenizer", + "processors" : [ + { + "chunking": { + "field_map": { + "body": "body_chunk" + }, + "algorithm": { + "fixed_token_length": { + "token_limit": 10, + "tokenizer": "lowercase" + } + } + } + } + ] +} diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json new file mode 100644 index 000000000..f6dcd844e --- /dev/null +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json @@ -0,0 +1,18 @@ +{ + "description": "An example fixed token length chunker pipeline with standard tokenizer", + "processors" : [ + { + "chunking": { + "field_map": { + "body": "body_chunk" + }, + "algorithm": { + "fixed_token_length": { + "token_limit": 10, + "tokenizer": "standard" + } + } + } + } + ] +} From ad00b88d435fff7662c0efc81cc8e160d0ce59b3 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Sun, 10 Mar 2024 21:56:27 +0800 Subject: [PATCH 104/189] bug fix: unit test failure due to invalid tokenizer Signed-off-by: yuye-aws --- .../neuralsearch/processor/DocumentChunkingProcessorTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 17d230e43..bd0a2a32d 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -95,7 +95,6 @@ public void setup() { private Map createFixedTokenLengthParameters() { Map parameters = new HashMap<>(); parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); - parameters.put(FixedTokenLengthChunker.TOKENIZER_FIELD, "letter"); return parameters; } From d4673d47802defc50d6b6866a5a45c1134d5d751 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Sun, 10 Mar 2024 23:26:15 +0800 Subject: [PATCH 105/189] bug fix: token concatenation in fixed token length algorithm Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 33 +++++++++---------- .../DocumentChunkingProcessorIT.java | 18 +++++----- .../DocumentChunkingProcessorTests.java | 28 ++++++++-------- .../chunker/FixedTokenLengthChunkerTests.java | 12 +++---- 4 files changed, 43 insertions(+), 48 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java 
b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 8a40b29cd..3424e87b3 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -10,13 +10,12 @@ import java.util.Map; import java.util.List; import java.util.ArrayList; -import java.util.stream.Collectors; - import lombok.extern.log4j.Log4j2; import org.apache.commons.lang3.math.NumberUtils; -import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.index.analysis.AnalysisRegistry; +import org.opensearch.action.admin.indices.analyze.AnalyzeAction; +import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; /** @@ -107,7 +106,7 @@ private void validatePositiveIntegerParameter(Map parameters, St * * @param content input string * @param parameters a map containing parameters, containing the following parameters - * 1. tokenizer the analyzer tokenizer in opensearch, please check https://opensearch.org/docs/latest/analyzers/tokenizers/index/ + * 1. tokenizer the analyzer tokenizer in OpenSearch * 2. token_limit the token limit for each chunked passage * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage * 4. max_token_count the max token limit for the tokenizer @@ -134,37 +133,35 @@ public List chunk(String content, Map parameters) { tokenizer = (String) parameters.get(TOKENIZER_FIELD); } - List tokens = tokenize(content, tokenizer, maxTokenCount); + List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); - String passage; - int startToken = 0; + int startTokenIndex = 0, endTokenIndex; + int startContentPosition, endContentPosition; BigDecimal overlapTokenNumberBigDecimal = overlapRate.multiply(new BigDecimal(String.valueOf(tokenLimit))) .setScale(0, RoundingMode.DOWN); int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue(); - while (startToken < tokens.size()) { - if (startToken + tokenLimit >= tokens.size()) { - // break the loop when already cover the last token - passage = String.join(" ", tokens.subList(startToken, tokens.size())); - passages.add(passage); + while (startTokenIndex < tokens.size()) { + endTokenIndex = Math.min(tokens.size(), startTokenIndex + tokenLimit) - 1; + startContentPosition = tokens.get(startTokenIndex).getStartOffset(); + endContentPosition = tokens.get(endTokenIndex).getEndOffset(); + passages.add(content.substring(startContentPosition, endContentPosition)); + if (startTokenIndex + tokenLimit >= tokens.size()) { break; - } else { - passage = String.join(" ", tokens.subList(startToken, startToken + tokenLimit)); - passages.add(passage); } - startToken += tokenLimit - overlapTokenNumber; + startTokenIndex += tokenLimit - overlapTokenNumber; } return passages; } - private List tokenize(String content, String tokenizer, int maxTokenCount) { + private List tokenize(String content, String tokenizer, int maxTokenCount) { AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); analyzeRequest.text(content); analyzeRequest.tokenizer(tokenizer); try { AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); - return analyzeResponse.getTokens().stream().map(AnalyzeAction.AnalyzeToken::getTerm).collect(Collectors.toList()); + 
return analyzeResponse.getTokens(); } catch (IOException e) { throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index 2f3452c16..762db29bd 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -74,8 +74,8 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { @@ -90,8 +90,8 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and tokens by standard"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard"); expectedPassages.add("tokenizer in OpenSearch"); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { @@ -101,15 +101,14 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception { try { - // lowercase tokenizer also excludes numbers createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - expectedPassages.add("this is an example document to be chunked the document"); - expectedPassages.add("contains a single paragraph two sentences and tokens by standard"); - expectedPassages.add("tokenizer in opensearch"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard"); + expectedPassages.add("tokenizer in OpenSearch"); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null); @@ -155,9 +154,8 @@ public void testDocumentChunkingProcessor_withCascadePipeline_successful() throw ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - // " ", "." 
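PATCH 105 is the substantive fix in this stretch: instead of joining analyzer terms with spaces, which silently dropped punctuation, each passage is now a substring of the original content running from the first token's start offset to the last token's end offset, so the updated expectations keep ". " and ", " intact. With token_limit 10 and overlap_rate 0.5, the BigDecimal math above yields floor(0.5 * 10) = 5 overlap tokens, so window starts advance 5 tokens at a time. A self-contained sketch of the windowing, with a whitespace regex standing in for the _analyze call (its offsets differ slightly from a real tokenizer, which excludes trailing punctuation from token offsets):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class OffsetWindowSketch {
        record Token(int start, int end) {}

        public static void main(String[] args) {
            String content = "This is an example document to be chunked. The document "
                + "contains a single paragraph, two sentences and 24 tokens by "
                + "standard tokenizer in OpenSearch.";
            // whitespace tokens with offsets stand in for the _analyze response
            List<Token> tokens = new ArrayList<>();
            Matcher matcher = Pattern.compile("\\S+").matcher(content);
            while (matcher.find()) {
                tokens.add(new Token(matcher.start(), matcher.end()));
            }

            int tokenLimit = 10;
            int overlapTokenNumber = (int) Math.floor(0.5 * tokenLimit); // 5
            List<String> passages = new ArrayList<>();
            int startTokenIndex = 0;
            while (startTokenIndex < tokens.size()) {
                int endTokenIndex = Math.min(tokens.size(), startTokenIndex + tokenLimit) - 1;
                // slice the original text, so punctuation and casing survive
                passages.add(content.substring(tokens.get(startTokenIndex).start(), tokens.get(endTokenIndex).end()));
                if (startTokenIndex + tokenLimit >= tokens.size()) {
                    break;
                }
                startTokenIndex += tokenLimit - overlapTokenNumber;
            }
            passages.forEach(System.out::println); // four overlapping passages, as in the tests
        }
    }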
and "," will not be included in fixed token length output expectedPassages.add("This is an example document to be chunked"); - expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("The document contains a single paragraph, two sentences and 24"); expectedPassages.add("tokens by standard tokenizer in OpenSearch"); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index bd0a2a32d..9fffce3ac 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -360,8 +360,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -376,8 +376,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -407,8 +407,8 @@ public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() { Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -440,11 +440,11 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSuccee assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is the first document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is the first document to be chunked. 
The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); - expectedPassages.add("This is the second document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is the second document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -488,8 +488,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -532,8 +532,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_the Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assert (nestedResult instanceof List); assertEquals(((List) nestedResult).size(), 2); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 75f40808f..1c678fa9c 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -130,8 +130,8 @@ public void testChunk_withTokenLimit_10() { "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = FixedTokenLengthChunker.chunk(content, parameters); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -146,7 +146,7 @@ public void testChunk_withTokenLimit_20() { List passages = FixedTokenLengthChunker.chunk(content, parameters); List expectedPassages = new ArrayList<>(); expectedPassages.add( - "This is an example document to be chunked The document contains a single paragraph two sentences and 24 tokens by" + "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by" ); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); @@ -160,9 +160,9 @@ public void testChunk_withOverlapRate_half() { "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = FixedTokenLengthChunker.chunk(content, parameters); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("to be chunked The document contains a single paragraph two"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("to be chunked. The document contains a single paragraph, two"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } From 7a589c6bdb1b129204817d27eedf2bf1df3b335d Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 11 Mar 2024 11:52:41 +0800 Subject: [PATCH 106/189] update chunker interface Signed-off-by: yuye-aws --- .../processor/chunker/Chunker.java | 19 ++- .../processor/chunker/ChunkerFactory.java | 7 +- .../processor/chunker/DelimiterChunker.java | 23 ++-- .../chunker/FixedTokenLengthChunker.java | 115 +++++++++--------- 4 files changed, 90 insertions(+), 74 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 0e4d3bc5d..90f5a11ee 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -4,6 +4,9 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import com.google.common.collect.ImmutableList; + +import java.util.ArrayList; import java.util.Map; import java.util.List; @@ -25,8 +28,20 @@ public interface Chunker { * Chunk the incoming string according to parameters and return chunked passages * * @param content input string - * @param parameters a map containing parameters for chunking algorithms * @return Chunked passages */ - List chunk(String content, Map parameters); + default List chunk(String content) { + return ImmutableList.of(); + } + + /** + * Chunk the incoming string according to parameters and return chunked passages + * + * @param content input string + * @param runtimeParameters a map containing runtime parameters for chunking algorithms + * @return Chunked passages + */ + default List chunk(String content, Map runtimeParameters) { + return ImmutableList.of(); + } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 52225ac34..2f72eab19 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -4,6 +4,7 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import java.util.Map; import java.util.Set; import org.opensearch.index.analysis.AnalysisRegistry; @@ -16,12 +17,12 @@ public class ChunkerFactory { public static final String FIXED_TOKEN_LENGTH_ALGORITHM 
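PATCH 106 reshapes Chunker into a self-configuring object: parameters are validated and stored at construction, and the interface grows two chunk overloads with empty-list defaults so that only FixedTokenLengthChunker has to implement the runtime-parameter variant. For illustration only, a hypothetical third implementation under this contract (the NewlineChunker name and its separator parameter are invented, and the Map<String, Object> signatures assume the plugin's interface):

    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;

    public class NewlineChunker implements Chunker {

        public static final String SEPARATOR_FIELD = "separator"; // hypothetical parameter

        private String separator = "\n";

        public NewlineChunker(Map<String, Object> parameters) {
            validateParameters(parameters);
        }

        @Override
        public void validateParameters(Map<String, Object> parameters) {
            if (parameters.containsKey(SEPARATOR_FIELD)) {
                if (!(parameters.get(SEPARATOR_FIELD) instanceof String)) {
                    throw new IllegalArgumentException("separator must be a string");
                }
                this.separator = parameters.get(SEPARATOR_FIELD).toString();
            }
        }

        @Override
        public List<String> chunk(String content) {
            // String.split takes a regex; fine for a literal newline separator
            return Arrays.asList(content.split(separator));
        }
    }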
= "fixed_token_length"; public static final String DELIMITER_ALGORITHM = "delimiter"; - public static Chunker create(String type, AnalysisRegistry analysisRegistry) { + public static Chunker create(String type, Map parameters) { switch (type) { case FIXED_TOKEN_LENGTH_ALGORITHM: - return new FixedTokenLengthChunker(analysisRegistry); + return new FixedTokenLengthChunker(parameters); case DELIMITER_ALGORITHM: - return new DelimiterChunker(); + return new DelimiterChunker(parameters); default: throw new IllegalArgumentException( "chunker type [" + type + "] is not supported. Supported chunkers types are " + ChunkerFactory.getAllChunkers() diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 76b258504..31e227999 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -15,12 +15,16 @@ */ public class DelimiterChunker implements Chunker { - public DelimiterChunker() {} + public DelimiterChunker(Map parameters) { + validateParameters(parameters); + } public static final String DELIMITER_FIELD = "delimiter"; public static final String DEFAULT_DELIMITER = "."; + private String delimiter = DEFAULT_DELIMITER; + /** * Validate the chunked passages for delimiter algorithm * @@ -31,11 +35,14 @@ public DelimiterChunker() {} @Override public void validateParameters(Map parameters) { if (parameters.containsKey(DELIMITER_FIELD)) { - Object delimiter = parameters.get(DELIMITER_FIELD); - if (!(delimiter instanceof String)) { - throw new IllegalArgumentException("delimiter parameters: " + delimiter + " must be string."); - } else if (StringUtils.isBlank(delimiter.toString())) { - throw new IllegalArgumentException("delimiter parameters should not be empty."); + if (!(parameters.get(DELIMITER_FIELD) instanceof String)) { + throw new IllegalArgumentException( + "delimiter parameter [" + DELIMITER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + ); + } + this.delimiter = parameters.get(DELIMITER_FIELD).toString(); + if (StringUtils.isBlank(delimiter)) { + throw new IllegalArgumentException("delimiter parameter [" + DELIMITER_FIELD + "] should not be empty."); } } } @@ -44,11 +51,9 @@ public void validateParameters(Map parameters) { * Return the chunked passages for delimiter algorithm * * @param content input string - * @param parameters a map containing parameters, containing the following parameters */ @Override - public List chunk(String content, Map parameters) { - String delimiter = DEFAULT_DELIMITER; + public List chunk(String content) { if (parameters.containsKey(DELIMITER_FIELD)) { Object delimiterObject = parameters.get(DELIMITER_FIELD); delimiter = delimiterObject.toString(); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 3424e87b3..ff413efc1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -10,12 +10,20 @@ import java.util.Map; import java.util.List; import java.util.ArrayList; + +import lombok.Setter; import lombok.extern.log4j.Log4j2; import org.apache.commons.lang3.math.NumberUtils; +import 
org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.index.IndexService; +import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; +import org.opensearch.index.mapper.IndexFieldMapper; +import org.opensearch.ingest.IngestDocument; + import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; /** @@ -27,31 +35,34 @@ public class FixedTokenLengthChunker implements Chunker { public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; + public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; public static final String TOKENIZER_FIELD = "tokenizer"; // default values for each parameter - private static final int DEFAULT_TOKEN_LIMIT = 500; + private static final int DEFAULT_TOKEN_LIMIT = 384; private static final BigDecimal DEFAULT_OVERLAP_RATE = new BigDecimal("0"); private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; private static final String DEFAULT_TOKENIZER = "standard"; - private static final BigDecimal OVERLAP_RATE_UPPER_BOUND = new BigDecimal("0.5"); - private final AnalysisRegistry analysisRegistry; + private int tokenLimit = DEFAULT_TOKEN_LIMIT; + private BigDecimal overlapRate = DEFAULT_OVERLAP_RATE; + private String tokenizer = DEFAULT_TOKENIZER; + private AnalysisRegistry analysisRegistry; - public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { - this.analysisRegistry = analysisRegistry; + public FixedTokenLengthChunker(Map parameters) { + validateParameters(parameters); } /** - * Validate the chunked passages for fixed token length algorithm, + * Validate and parse the parameters for fixed token length algorithm, * will throw IllegalArgumentException when parameters are invalid * * @param parameters a map containing parameters, containing the following parameters: - * 1. tokenizer the analyzer tokenizer in opensearch, please check https://opensearch.org/docs/latest/analyzers/tokenizers/index/ - * 2. token_limit the token limit for each chunked passage - * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - * 4. max_token_count the max token limit for the tokenizer + * 1. tokenizer: the analyzer tokenizer in opensearch + * 2. token_limit: the token limit for each chunked passage + * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage + * 4. 
max_token_count: the max token limit for the tokenizer * Here are requirements for parameters: * max_token_count and token_limit should be a positive integer * overlap_rate should be within range [0, 0.5] @@ -59,8 +70,31 @@ public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry) { */ @Override public void validateParameters(Map parameters) { - validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD); - validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD); + if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { + String tokenLimitString = parameters.get(TOKEN_LIMIT_FIELD).toString(); + if (!(NumberUtils.isParsable(tokenLimitString))) { + throw new IllegalArgumentException( + "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + this.tokenLimit = NumberUtils.createInteger(tokenLimitString); + if (tokenLimit <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive"); + } + } + + if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) { + String maxTokenCountString = parameters.get(MAX_TOKEN_COUNT_FIELD).toString(); + if (!(NumberUtils.isParsable(maxTokenCountString))) { + throw new IllegalArgumentException( + "fixed length parameter [" + MAX_TOKEN_COUNT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + this.maxTokenCount = NumberUtils.createInteger(maxTokenCountString); + if (maxTokenCount <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + MAX_TOKEN_COUNT_FIELD + "] must be positive"); + } + } if (parameters.containsKey(OVERLAP_RATE_FIELD)) { String overlapRateString = parameters.get(OVERLAP_RATE_FIELD).toString(); @@ -69,7 +103,7 @@ public void validateParameters(Map parameters) { "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - BigDecimal overlapRate = new BigDecimal(overlapRateString); + this.overlapRate = new BigDecimal(overlapRateString); if (overlapRate.compareTo(BigDecimal.ZERO) < 0 || overlapRate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { throw new IllegalArgumentException( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND @@ -77,27 +111,13 @@ public void validateParameters(Map parameters) { } } - if (parameters.containsKey(TOKENIZER_FIELD) && !(parameters.get(TOKENIZER_FIELD) instanceof String)) { - throw new IllegalArgumentException( - "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" - ); - } - } - - private void validatePositiveIntegerParameter(Map parameters, String fieldName) { - // this method validate that parameter is a positive integer - if (!parameters.containsKey(fieldName)) { - // all parameters are optional - return; - } - String fieldValue = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValue))) { - throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - if (NumberUtils.createInteger(fieldValue) <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + if (parameters.containsKey(TOKENIZER_FIELD)) { + if (!(parameters.get(TOKENIZER_FIELD) instanceof String)) { + throw new IllegalArgumentException( + "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + ); + } + this.tokenizer = 
parameters.get(TOKENIZER_FIELD).toString(); } } @@ -105,34 +125,9 @@ private void validatePositiveIntegerParameter(Map parameters, St * Return the chunked passages for fixed token length algorithm * * @param content input string - * @param parameters a map containing parameters, containing the following parameters - * 1. tokenizer the analyzer tokenizer in OpenSearch - * 2. token_limit the token limit for each chunked passage - * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - * 4. max_token_count the max token limit for the tokenizer */ @Override - public List chunk(String content, Map parameters) { - // prior to chunking, parameters have been validated - int tokenLimit = DEFAULT_TOKEN_LIMIT; - BigDecimal overlapRate = DEFAULT_OVERLAP_RATE; - int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; - - String tokenizer = DEFAULT_TOKENIZER; - - if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { - tokenLimit = ((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue(); - } - if (parameters.containsKey(OVERLAP_RATE_FIELD)) { - overlapRate = new BigDecimal(parameters.get(OVERLAP_RATE_FIELD).toString()); - } - if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) { - maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT_FIELD)).intValue(); - } - if (parameters.containsKey(TOKENIZER_FIELD)) { - tokenizer = (String) parameters.get(TOKENIZER_FIELD); - } - + public List chunk(String content, Map runtimeParameters) { List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); From e1f6c790a7908f957742b9363f3f4269bc479398 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 11 Mar 2024 12:33:36 +0800 Subject: [PATCH 107/189] track chunkCount within function Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 188 ++++++++++-------- .../processor/chunker/Chunker.java | 1 - .../processor/chunker/ChunkerFactory.java | 2 - 3 files changed, 100 insertions(+), 91 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 63d04a505..c3432dfaa 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -4,7 +4,9 @@ */ package org.opensearch.neuralsearch.processor; +import java.util.HashMap; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.ArrayList; import java.util.List; @@ -49,12 +51,9 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; - private int maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; - - private String chunkerType; - - private Map chunkerParameters; + private int maxChunkLimit; + private Chunker chunker; private final Map fieldMap; private final ClusterService clusterService; @@ -65,19 +64,6 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private final Environment environment; - /** - * Users may specify parameter max_chunk_limit for a restriction on the number of strings from chunking results. - * Here the chunkCountWrapper is to store and increase the number of chunks across all output fields. - * chunkCount: the number of chunks of chunking result. 
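PATCH 107 separates configuration from execution. Everything the user declared in the pipeline is parsed exactly once, when the processor and its single Chunker instance are built; values that depend on the target document, currently just the analyzer's max_token_count, travel in a runtimeParameters map; and the running chunk total is threaded through arguments and return values instead of the mutable ChunkCountWrapper being deleted here. A sketch of the resulting calling convention (assumes the plugin classes; the content value is made up):

    Map<String, Object> runtimeParameters = new HashMap<>();
    runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, 10000);
    List<String> passages = chunker instanceof FixedTokenLengthChunker
        ? chunker.chunk("text to split", runtimeParameters)
        : chunker.chunk("text to split");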
- */ - static class ChunkCountWrapper { - private int chunkCount; - - protected ChunkCountWrapper(int chunkCount) { - this.chunkCount = chunkCount; - } - } - public DocumentChunkingProcessor( String tag, String description, @@ -108,41 +94,42 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm" ); } - - for (Map.Entry algorithmEntry : algorithmMap.entrySet()) { - String algorithmKey = algorithmEntry.getKey(); - Object algorithmValue = algorithmEntry.getValue(); - Set supportedChunkers = ChunkerFactory.getAllChunkers(); - if (!supportedChunkers.contains(algorithmKey)) { - throw new IllegalArgumentException( - "Unable to create the processor as chunker algorithm [" - + algorithmKey - + "] is not supported. Supported chunkers types are " - + supportedChunkers - ); - } - if (!(algorithmValue instanceof Map)) { + Entry algorithmEntry = algorithmMap.entrySet().iterator().next(); + String algorithmKey = algorithmEntry.getKey(); + Object algorithmValue = algorithmEntry.getValue(); + Set supportedChunkers = ChunkerFactory.getAllChunkers(); + if (!supportedChunkers.contains(algorithmKey)) { + throw new IllegalArgumentException( + "Unable to create the processor as chunker algorithm [" + + algorithmKey + + "] is not supported. Supported chunkers types are " + + supportedChunkers + ); + } + if (!(algorithmValue instanceof Map)) { + throw new IllegalArgumentException( + "Unable to create the processor as [" + algorithmKey + "] parameters cannot be cast to [" + Map.class.getName() + "]" + ); + } + Map chunkerParameters = (Map) algorithmValue; + if (Objects.equals(algorithmKey, FIXED_TOKEN_LENGTH_ALGORITHM)) { + chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); + } + this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); + if (chunkerParameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { + String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString(); + if (!(NumberUtils.isParsable(maxChunkLimitString))) { throw new IllegalArgumentException( - "Unable to create the processor as [" + algorithmKey + "] parameters cannot be cast to [" + Map.class.getName() + "]" + "Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - Chunker chunker = ChunkerFactory.create(algorithmKey, analysisRegistry); - this.chunkerType = algorithmKey; - this.chunkerParameters = (Map) algorithmValue; - chunker.validateParameters(chunkerParameters); - if (chunkerParameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { - String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString(); - if (!(NumberUtils.isParsable(maxChunkLimitString))) { - throw new IllegalArgumentException( - "Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - int maxChunkLimit = NumberUtils.createInteger(maxChunkLimitString); - if (maxChunkLimit <= 0 && maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT) { - throw new IllegalArgumentException("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer"); - } - this.maxChunkLimit = maxChunkLimit; + int maxChunkLimit = NumberUtils.createInteger(maxChunkLimitString); + if (maxChunkLimit <= 0 && maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT) { + throw new IllegalArgumentException("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer"); } + this.maxChunkLimit = maxChunkLimit; + } else { + this.maxChunkLimit = 
DEFAULT_MAX_CHUNK_LIMIT; } } @@ -160,41 +147,60 @@ private boolean isListOfString(Object value) { return true; } - private List chunkString(String content, ChunkCountWrapper chunkCountWrapper) { - Chunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry); - List result = chunker.chunk(content, chunkerParameters); - chunkCountWrapper.chunkCount += result.size(); - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCountWrapper.chunkCount > maxChunkLimit) { + private int chunkString(String content, List result, Map runTimeParameters, int chunkCount) { + // chunk the content, return the updated chunkCount and add chunk passages to result + List contentResult; + if (chunker instanceof FixedTokenLengthChunker) { + contentResult = chunker.chunk(content, runTimeParameters); + } else { + contentResult = chunker.chunk(content); + } + chunkCount += contentResult.size(); + if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCount > maxChunkLimit) { throw new IllegalArgumentException( "Unable to create the processor as the number of chunks [" - + chunkCountWrapper.chunkCount + + chunkCount + "] exceeds the maximum chunk limit [" + maxChunkLimit + "]" ); } - return result; + result.addAll(contentResult); + return chunkCount; } - private List chunkList(List contentList, ChunkCountWrapper chunkCountWrapper) { + private int chunkList(List contentList, List result, Map runTimeParameters, int chunkCount) { // flatten the List<List<String>> output to List<String> - List result = new ArrayList<>(); for (String content : contentList) { - result.addAll(chunkString(content, chunkCountWrapper)); + chunkCount = chunkString(content, result, runTimeParameters, chunkCount); } - return result; + return chunkCount; } @SuppressWarnings("unchecked") - private List chunkLeafType(Object value, ChunkCountWrapper chunkCountWrapper) { + private int chunkLeafType(Object value, List result, Map runTimeParameters, int chunkCount) { // leaf type is either String or List - List chunkedResult = null; + // the result should be an empty list if (value instanceof String) { - chunkedResult = chunkString(value.toString(), chunkCountWrapper); + chunkCount = chunkString(value.toString(), result, runTimeParameters, chunkCount); } else if (isListOfString(value)) { - chunkedResult = chunkList((List) value, chunkCountWrapper); + chunkCount = chunkList((List) value, result, runTimeParameters, chunkCount); } - return chunkedResult; + return chunkCount; } + private int getMaxTokenCount(Map sourceAndMetadataMap) { + String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); + IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); + int maxTokenCount; + if (indexMetadata != null) { + // if the index exists, read maxTokenCount from the index setting + IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); + maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); + } else { + maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(environment.settings()); } - return chunkedResult; + return maxTokenCount; } /** @@ -204,23 +210,14 @@ private List chunkLeafType(Object value, ChunkCountWrapper chunkCountWra @Override public IngestDocument execute(IngestDocument ingestDocument) { validateFieldsValue(ingestDocument); - ChunkCountWrapper chunkCountWrapper = new ChunkCountWrapper(0); - if (Objects.equals(chunkerType, FIXED_TOKEN_LENGTH_ALGORITHM)) { - // add maxTokenCount setting from index metadata to chunker parameters - Map sourceAndMetadataMap =
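getMaxTokenCount above decides which analyzer cap applies: if the target index already exists, its own index.analyze.max_token_count setting wins; otherwise the node-level default (10000) is read from the environment. That is also why the long-document integration test can expect a failure at exactly 100 tokens: its test index pins the cap, roughly like this sketch (assumes org.opensearch.action.admin.indices.create.CreateIndexRequest and org.opensearch.common.settings.Settings):

    CreateIndexRequest request = new CreateIndexRequest("test-document-chunking-index")
        .settings(Settings.builder().put("index.analyze.max_token_count", 100)); // matches the [100] in the IT assertion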
ingestDocument.getSourceAndMetadata(); - String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); - int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(environment.settings()); - IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); - if (indexMetadata != null) { - // if the index exists, read maxTokenCount from the index setting - IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); - maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); - } - chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); - } - + int chunkCount = 0; + Map runtimeParameters = new HashMap<>(); Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); - chunkMapType(sourceAndMetadataMap, fieldMap, chunkCountWrapper); + if (chunker instanceof FixedTokenLengthChunker) { + int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); + runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); + } + chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, chunkCount); return ingestDocument; } @@ -269,7 +266,12 @@ private void validateListTypeValue(String sourceKey, Object sourceValue, int max } @SuppressWarnings("unchecked") - private void chunkMapType(Map sourceAndMetadataMap, Map fieldMap, ChunkCountWrapper chunkCountWrapper) { + private int chunkMapType( + Map sourceAndMetadataMap, + Map fieldMap, + Map runtimeParameters, + int chunkCount + ) { for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { String originalKey = fieldMapEntry.getKey(); Object targetKey = fieldMapEntry.getValue(); @@ -280,20 +282,30 @@ private void chunkMapType(Map sourceAndMetadataMap, Map sourceObjectList = (List) sourceObject; for (Object source : sourceObjectList) { if (source instanceof Map) { - chunkMapType((Map) source, (Map) targetKey, chunkCountWrapper); + chunkCount = chunkMapType( + (Map) source, + (Map) targetKey, + runtimeParameters, + chunkCount + ); } } } else if (sourceObject instanceof Map) { - chunkMapType((Map) sourceObject, (Map) targetKey, chunkCountWrapper); + chunkCount = chunkMapType( + (Map) sourceObject, + (Map) targetKey, + runtimeParameters, + chunkCount + ); } } else { // chunk the object when target key is a string Object chunkObject = sourceAndMetadataMap.get(originalKey); - List chunkedResult = chunkLeafType(chunkObject, chunkCountWrapper); - if (chunkedResult != null) { - sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); - } + List chunkedResult = new ArrayList<>(); + chunkCount = chunkLeafType(chunkObject, chunkedResult, runtimeParameters, chunkCount); + sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); } } + return chunkCount; } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 90f5a11ee..29a0539f2 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -6,7 +6,6 @@ import com.google.common.collect.ImmutableList; -import java.util.ArrayList; import java.util.Map; import java.util.List; diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 2f72eab19..086cb0b71 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java 
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -7,8 +7,6 @@ import java.util.Map; import java.util.Set; -import org.opensearch.index.analysis.AnalysisRegistry; - /** * A factory to create different chunking algorithm classes and return all supported chunking algorithms. */ From bb372e6301b56d92a9d48bdda1ca144ccb59a11f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 11 Mar 2024 12:34:01 +0800 Subject: [PATCH 108/189] bug fix: allow white space as the delimiter Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 31e227999..1559c7a79 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -8,8 +8,6 @@ import java.util.List; import java.util.ArrayList; -import org.apache.commons.lang3.StringUtils; - /** * The implementation {@link Chunker} for delimiter algorithm */ @@ -37,11 +35,11 @@ public void validateParameters(Map parameters) { if (parameters.containsKey(DELIMITER_FIELD)) { if (!(parameters.get(DELIMITER_FIELD) instanceof String)) { throw new IllegalArgumentException( - "delimiter parameter [" + DELIMITER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" + "delimiter parameter [" + DELIMITER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" ); } this.delimiter = parameters.get(DELIMITER_FIELD).toString(); - if (StringUtils.isBlank(delimiter)) { + if (delimiter.isEmpty()) { throw new IllegalArgumentException("delimiter parameter [" + DELIMITER_FIELD + "] should not be empty."); } } @@ -54,11 +52,6 @@ public void validateParameters(Map parameters) { */ @Override public List chunk(String content) { - if (parameters.containsKey(DELIMITER_FIELD)) { - Object delimiterObject = parameters.get(DELIMITER_FIELD); - delimiter = delimiterObject.toString(); - } - List chunkResult = new ArrayList<>(); int start = 0, end; int nextDelimiterPosition = content.indexOf(delimiter); From 2538ab3ccbca1cefc65541377f0f67999e0fae18 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 13:44:25 +0800 Subject: [PATCH 109/189] fix fixed length chunker Signed-off-by: xinyual --- .../chunker/FixedTokenLengthChunker.java | 174 ++++++++++-------- 1 file changed, 96 insertions(+), 78 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index ff413efc1..8059e5651 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -5,25 +5,15 @@ package org.opensearch.neuralsearch.processor.chunker; import java.io.IOException; -import java.math.BigDecimal; -import java.math.RoundingMode; import java.util.Map; import java.util.List; import java.util.ArrayList; - -import lombok.Setter; import lombok.extern.log4j.Log4j2; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; -import org.opensearch.cluster.metadata.IndexMetadata; -import org.opensearch.index.IndexService; -import org.opensearch.index.IndexSettings; import 
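PATCH 108's switch from isBlank to isEmpty is what legalizes whitespace-only delimiters such as " " or "\n". The chunk loop itself is cut off in the hunk above after the initial indexOf; a minimal reconstruction of the same single-pass split, under the assumption that each passage keeps its trailing delimiter and any tail after the last delimiter becomes the final passage:

    import java.util.ArrayList;
    import java.util.List;

    public class DelimiterSplitSketch {
        static List<String> chunk(String content, String delimiter) {
            List<String> chunkResult = new ArrayList<>();
            int start = 0;
            int nextDelimiterPosition = content.indexOf(delimiter);
            while (nextDelimiterPosition != -1) {
                int end = nextDelimiterPosition + delimiter.length();
                chunkResult.add(content.substring(start, end)); // delimiter stays attached
                start = end;
                nextDelimiterPosition = content.indexOf(delimiter, start);
            }
            if (start < content.length()) {
                chunkResult.add(content.substring(start)); // trailing remainder
            }
            return chunkResult;
        }

        public static void main(String[] args) {
            System.out.println(chunk("a b c", " ")); // [a , b , c] -- whitespace now allowed
        }
    }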
org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; -import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; -import org.opensearch.index.mapper.IndexFieldMapper; -import org.opensearch.ingest.IngestDocument; - import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; /** @@ -35,34 +25,44 @@ public class FixedTokenLengthChunker implements Chunker { public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; - public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; public static final String TOKENIZER_FIELD = "tokenizer"; + public static final String TOKEN_CONCATENATOR_FIELD = "token_concatenator"; + // default values for each parameter private static final int DEFAULT_TOKEN_LIMIT = 384; - private static final BigDecimal DEFAULT_OVERLAP_RATE = new BigDecimal("0"); + private static final Double DEFAULT_OVERLAP_RATE = 0.0; private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; private static final String DEFAULT_TOKENIZER = "standard"; - private static final BigDecimal OVERLAP_RATE_UPPER_BOUND = new BigDecimal("0.5"); - private int tokenLimit = DEFAULT_TOKEN_LIMIT; - private BigDecimal overlapRate = DEFAULT_OVERLAP_RATE; - private String tokenizer = DEFAULT_TOKENIZER; - private AnalysisRegistry analysisRegistry; + private static final String DEFAULT_TOKEN_CONCATENATOR = " "; + + private static final Double OVERLAP_RATE_UPPER_BOUND = 0.5; + + private Double overlapRate; + + private int tokenLimit; + + private String tokenConcatenator; + + private String tokenizer; + + private final AnalysisRegistry analysisRegistry; - public FixedTokenLengthChunker(Map parameters) { + public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry, Map parameters) { validateParameters(parameters); + this.analysisRegistry = analysisRegistry; } /** - * Validate and parse the parameters for fixed token length algorithm, + * Validate the chunked passages for fixed token length algorithm, * will throw IllegalArgumentException when parameters are invalid * * @param parameters a map containing parameters, containing the following parameters: - * 1. tokenizer: the analyzer tokenizer in opensearch - * 2. token_limit: the token limit for each chunked passage - * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - * 4. max_token_count: the max token limit for the tokenizer + * 1. tokenizer the analyzer tokenizer in opensearch, please check https://opensearch.org/docs/latest/analyzers/tokenizers/index/ + * 2. token_limit the token limit for each chunked passage + * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage + * 4. 
max_token_count the max token limit for the tokenizer * Here are requirements for parameters: * max_token_count and token_limit should be a positive integer * overlap_rate should be within range [0, 0.5] @@ -70,93 +70,111 @@ public FixedTokenLengthChunker(Map parameters) { */ @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { - String tokenLimitString = parameters.get(TOKEN_LIMIT_FIELD).toString(); - if (!(NumberUtils.isParsable(tokenLimitString))) { - throw new IllegalArgumentException( - "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - this.tokenLimit = NumberUtils.createInteger(tokenLimitString); - if (tokenLimit <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive"); - } - } - - if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) { - String maxTokenCountString = parameters.get(MAX_TOKEN_COUNT_FIELD).toString(); - if (!(NumberUtils.isParsable(maxTokenCountString))) { - throw new IllegalArgumentException( - "fixed length parameter [" + MAX_TOKEN_COUNT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - this.maxTokenCount = NumberUtils.createInteger(maxTokenCountString); - if (maxTokenCount <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + MAX_TOKEN_COUNT_FIELD + "] must be positive"); - } - } - + this.tokenLimit = validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); if (parameters.containsKey(OVERLAP_RATE_FIELD)) { String overlapRateString = parameters.get(OVERLAP_RATE_FIELD).toString(); if (!(NumberUtils.isParsable(overlapRateString))) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - this.overlapRate = new BigDecimal(overlapRateString); - if (overlapRate.compareTo(BigDecimal.ZERO) < 0 || overlapRate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { + Double overlapRate = Double.valueOf(overlapRateString); + if (overlapRate < 0 || overlapRate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND ); } + this.overlapRate = overlapRate; + } else { + this.overlapRate = DEFAULT_OVERLAP_RATE; } + this.tokenizer = validateStringParameters(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER, false); + this.tokenConcatenator = validateStringParameters(parameters, TOKEN_CONCATENATOR_FIELD, DEFAULT_TOKEN_CONCATENATOR, true); + } - if (parameters.containsKey(TOKENIZER_FIELD)) { - if (!(parameters.get(TOKENIZER_FIELD) instanceof String)) { - throw new IllegalArgumentException( - "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" - ); - } - this.tokenizer = parameters.get(TOKENIZER_FIELD).toString(); + public static String validateStringParameters( + Map parameters, + String fieldName, + String defaultValue, + boolean allowEmpty + ) { + if (!parameters.containsKey(fieldName)) { + // all parameters are optional + return defaultValue; + } + if (!(parameters.get(fieldName) instanceof String)) { + throw new IllegalArgumentException("Chunker parameter [" + fieldName + "] 
cannot be cast to [" + String.class.getName() + "]"); + } else if (StringUtils.isEmpty(parameters.get(fieldName).toString()) && !allowEmpty) { + throw new IllegalArgumentException("Chunker parameter: " + fieldName + " should not be empty."); } + return (String) parameters.get(fieldName); + } + + private int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { + // this method validate that parameter is a positive integer + if (!parameters.containsKey(fieldName)) { + // all parameters are optional + return defaultValue; + } + String fieldValue = parameters.get(fieldName).toString(); + if (!(NumberUtils.isParsable(fieldValue))) { + throw new IllegalArgumentException( + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + if (NumberUtils.createInteger(fieldValue) <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + } + return Integer.valueOf(fieldValue); } /** * Return the chunked passages for fixed token length algorithm * * @param content input string + * @param parameters a map containing parameters, containing the following parameters + * 1. tokenizer the analyzer tokenizer in OpenSearch + * 2. token_limit the token limit for each chunked passage + * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage + * 4. max_token_count the max token limit for the tokenizer */ @Override - public List chunk(String content, Map runtimeParameters) { - List tokens = tokenize(content, tokenizer, maxTokenCount); + public List chunk(String content, Map parameters) { + // prior to chunking, parameters have been validated + int maxTokenCount = validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); + + List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); - int startTokenIndex = 0, endTokenIndex; - int startContentPosition, endContentPosition; - BigDecimal overlapTokenNumberBigDecimal = overlapRate.multiply(new BigDecimal(String.valueOf(tokenLimit))) - .setScale(0, RoundingMode.DOWN); - int overlapTokenNumber = overlapTokenNumberBigDecimal.intValue(); - - while (startTokenIndex < tokens.size()) { - endTokenIndex = Math.min(tokens.size(), startTokenIndex + tokenLimit) - 1; - startContentPosition = tokens.get(startTokenIndex).getStartOffset(); - endContentPosition = tokens.get(endTokenIndex).getEndOffset(); - passages.add(content.substring(startContentPosition, endContentPosition)); - if (startTokenIndex + tokenLimit >= tokens.size()) { + Double overlapTokenNumberDouble = overlapRate * tokenLimit; + int overlapTokenNumber = overlapTokenNumberDouble.intValue(); + + int startToken = 0; + while (startToken < tokens.size()) { + if (startToken + tokenLimit >= tokens.size()) { + // break the loop when already cover the last token + passages.add(String.join(tokenConcatenator, tokens.subList(startToken, tokens.size()))); break; + } else { + passages.add(String.join(tokenConcatenator, tokens.subList(startToken, startToken + tokenLimit))); } - startTokenIndex += tokenLimit - overlapTokenNumber; + startToken += tokenLimit - overlapTokenNumber; } return passages; } - private List tokenize(String content, String tokenizer, int maxTokenCount) { + private List tokenize(String content, String tokenizer, int maxTokenCount) { AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); analyzeRequest.text(content); 
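// reviewer commentary, not part of the patch: the request assembled here analyzes the raw
// field value with the named tokenizer through the AnalysisRegistry, and the maxTokenCount
// argument handed to analyze() below bounds how many tokens the request may produce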
analyzeRequest.tokenizer(tokenizer); try { AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); - return analyzeResponse.getTokens(); + List tokenList = new ArrayList<>(); + List analyzeTokenList = analyzeResponse.getTokens(); + for (AnalyzeAction.AnalyzeToken analyzeToken : analyzeTokenList) { + tokenList.add(analyzeToken.getTerm()); + } + return tokenList; } catch (IOException e) { throw new RuntimeException("Fixed token length algorithm met with exception: " + e); } From 9c9172d85f737fa38c857b515aeaaf63516ea956 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 13:44:54 +0800 Subject: [PATCH 110/189] fix delimiter chunker Signed-off-by: xinyual --- .../processor/chunker/DelimiterChunker.java | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 1559c7a79..dc37b7876 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -8,6 +8,8 @@ import java.util.List; import java.util.ArrayList; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.validateStringParameters; + /** * The implementation {@link Chunker} for delimiter algorithm */ @@ -21,7 +23,7 @@ public DelimiterChunker(Map parameters) { public static final String DEFAULT_DELIMITER = "."; - private String delimiter = DEFAULT_DELIMITER; + private String delimiter; /** * Validate the chunked passages for delimiter algorithm @@ -32,24 +34,9 @@ public DelimiterChunker(Map parameters) { */ @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(DELIMITER_FIELD)) { - if (!(parameters.get(DELIMITER_FIELD) instanceof String)) { - throw new IllegalArgumentException( - "delimiter parameter [" + DELIMITER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" - ); - } - this.delimiter = parameters.get(DELIMITER_FIELD).toString(); - if (delimiter.isEmpty()) { - throw new IllegalArgumentException("delimiter parameter [" + DELIMITER_FIELD + "] should not be empty."); - } - } + this.delimiter = validateStringParameters(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER, false); } - /** - * Return the chunked passages for delimiter algorithm - * - * @param content input string - */ @Override public List chunk(String content) { List chunkResult = new ArrayList<>(); @@ -69,4 +56,15 @@ public List chunk(String content) { return chunkResult; } + + /** + * Return the chunked passages for delimiter algorithm + * + * @param content input string + * @param parameters a map containing parameters, containing the following parameters + */ + @Override + public List chunk(String content, Map parameters) { + return chunk(content); + } }
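[Reviewer note, not part of the patch] With the delimiter chunker now delegating its two-argument chunk to the single-argument form, the interesting arithmetic lives in the fixed-token-length path. A self-contained sketch of that window math, assuming the same truncation of overlap_rate * token_limit used in this patch; the class is illustrative only:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class FixedLengthWindowSketch {
    // emulate FixedTokenLengthChunker: join token windows with a concatenator and
    // advance each window by tokenLimit - overlapTokens until the last token is covered
    static List<String> windows(List<String> tokens, int tokenLimit, double overlapRate) {
        int overlapTokens = (int) (overlapRate * tokenLimit); // truncated, as in the patch
        List<String> passages = new ArrayList<>();
        int start = 0;
        while (start < tokens.size()) {
            if (start + tokenLimit >= tokens.size()) {
                passages.add(String.join(" ", tokens.subList(start, tokens.size())));
                break; // final window reached
            }
            passages.add(String.join(" ", tokens.subList(start, start + tokenLimit)));
            start += tokenLimit - overlapTokens;
        }
        return passages;
    }

    public static void main(String[] args) {
        List<String> tokens = Arrays.asList("t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
        // token_limit 4 with overlap_rate 0.5 advances two tokens per window:
        // [t1 t2 t3 t4, t3 t4 t5 t6, t5 t6 t7 t8]
        System.out.println(windows(tokens, 4, 0.5));
    }
}

From d05b2469201c4f665d473db429360138c39328ef Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 13:45:33 +0800 Subject: [PATCH 111/189] fix chunker factory Signed-off-by: xinyual --- .../neuralsearch/processor/chunker/ChunkerFactory.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 086cb0b71..99460c762 100644 --- 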
a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -7,6 +7,8 @@ import java.util.Map; import java.util.Set; +import org.opensearch.index.analysis.AnalysisRegistry; + /** * A factory to create different chunking algorithm classes and return all supported chunking algorithms. */ @@ -15,15 +17,15 @@ public class ChunkerFactory { public static final String FIXED_TOKEN_LENGTH_ALGORITHM = "fixed_token_length"; public static final String DELIMITER_ALGORITHM = "delimiter"; - public static Chunker create(String type, Map parameters) { + public static Chunker create(String type, AnalysisRegistry analysisRegistry, Map parameters) { switch (type) { case FIXED_TOKEN_LENGTH_ALGORITHM: - return new FixedTokenLengthChunker(parameters); + return new FixedTokenLengthChunker(analysisRegistry, parameters); case DELIMITER_ALGORITHM: return new DelimiterChunker(parameters); default: throw new IllegalArgumentException( - "chunker type [" + type + "] is not supported. Supported chunkers types are " + ChunkerFactory.getAllChunkers() + "chunker type [" + type + "] is not supported. Supported chunkers types are " + ChunkerFactory.getAllChunkers() ); } } From 04fc7d31673b37652da4cd54b3504c80e93d1f6d Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 13:39:23 +0800 Subject: [PATCH 112/189] fix UTs Signed-off-by: xinyual --- .../chunker/FixedTokenLengthChunker.java | 8 ++-- .../chunker/ChunkerFactoryTests.java | 7 +-- .../chunker/DelimiterChunkerTests.java | 46 +++++++++---------- .../chunker/FixedTokenLengthChunkerTests.java | 36 +++++++++------ 4 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 8059e5651..b63bed987 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -93,10 +93,10 @@ public void validateParameters(Map parameters) { } public static String validateStringParameters( - Map parameters, - String fieldName, - String defaultValue, - boolean allowEmpty + Map parameters, + String fieldName, + String defaultValue, + boolean allowEmpty ) { if (!parameters.containsKey(fieldName)) { // all parameters are optional diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 8978946a7..32f1fd924 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -8,6 +8,7 @@ import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.test.OpenSearchTestCase; +import java.util.Map; import java.util.Set; public class ChunkerFactoryTests extends OpenSearchTestCase { @@ -21,13 +22,13 @@ public void testGetAllChunkers() { } public void testCreate_FixedTokenLength() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, analysisRegistry); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, analysisRegistry, Map.of()); assertNotNull(chunker); assertTrue(chunker instanceof FixedTokenLengthChunker); } public void 
testCreate_Delimiter() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry, Map.of()); assertNotNull(chunker); assertTrue(chunker instanceof DelimiterChunker); } @@ -36,7 +37,7 @@ public void testCreate_Invalid() { String invalidChunkerType = "Invalid Chunker Type"; IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> ChunkerFactory.create(invalidChunkerType, analysisRegistry) + () -> ChunkerFactory.create(invalidChunkerType, analysisRegistry, Map.of()) ); assert (illegalArgumentException.getMessage().contains("chunker type [" + invalidChunkerType + "] is not supported.")); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 1245f2a71..5f5539c37 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -17,30 +17,31 @@ public class DelimiterChunkerTests extends OpenSearchTestCase { public void testChunkerWithDelimiterFieldNotString() { - DelimiterChunker chunker = new DelimiterChunker(); - String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(DELIMITER_FIELD, List.of("")); - Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string.", exception.getMessage()); + Exception exception = assertThrows( + IllegalArgumentException.class, + () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, List.of(""))) + ); + Assert.assertEquals( + "Chunker parameter [" + DELIMITER_FIELD + "] cannot be cast to [" + String.class.getName() + "]", + exception.getMessage() + ); } public void testChunkerWithDelimiterFieldNoString() { - DelimiterChunker chunker = new DelimiterChunker(); - Map inputParameters = Map.of(DELIMITER_FIELD, ""); - Exception exception = assertThrows(IllegalArgumentException.class, () -> chunker.validateParameters(inputParameters)); - Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage()); + Exception exception = assertThrows(IllegalArgumentException.class, () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, ""))); + Assert.assertEquals("Chunker parameter: " + DELIMITER_FIELD + " should not be empty.", exception.getMessage()); } public void testChunker() { - DelimiterChunker chunker = new DelimiterChunker(); + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + Map inputParameters = Map.of(); List chunkResult = chunker.chunk(content, inputParameters); assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } public void testChunkerWithDefaultDelimiter() { - DelimiterChunker chunker = new DelimiterChunker(); + DelimiterChunker chunker = new DelimiterChunker(Map.of()); String content = "a.b.c.d"; Map inputParameters = Map.of(); List chunkResult = chunker.chunk(content, inputParameters); @@ -48,41 +49,40 @@ public void testChunkerWithDefaultDelimiter() { } public void testChunkerWithDelimiterEnd() { - DelimiterChunker chunker = new DelimiterChunker(); + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, 
"\n")); String content = "a\nb\nc\nd\n"; - Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + Map inputParameters = Map.of(); List chunkResult = chunker.chunk(content, inputParameters); assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult); } public void testChunkerWithOnlyDelimiter() { - DelimiterChunker chunker = new DelimiterChunker(); + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "\n"; - Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + Map inputParameters = Map.of(); List chunkResult = chunker.chunk(content, inputParameters); assertEquals(List.of("\n"), chunkResult); } public void testChunkerWithAllDelimiters() { - DelimiterChunker chunker = new DelimiterChunker(); + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "\n\n\n"; - Map inputParameters = Map.of(DELIMITER_FIELD, "\n"); + Map inputParameters = Map.of(); List chunkResult = chunker.chunk(content, inputParameters); assertEquals(List.of("\n", "\n", "\n"), chunkResult); } public void testChunkerWithDifferentDelimiters() { - DelimiterChunker chunker = new DelimiterChunker(); + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, ".")); String content = "a.b.cc.d."; - Map inputParameters = Map.of(DELIMITER_FIELD, "."); - List chunkResult = chunker.chunk(content, inputParameters); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("a.", "b.", "cc.", "d."), chunkResult); } public void testChunkerWithStringDelimiter() { - DelimiterChunker chunker = new DelimiterChunker(); + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n")); String content = "\n\na\n\n\n"; - Map inputParameters = Map.of(DELIMITER_FIELD, "\n\n"); + Map inputParameters = Map.of(); List chunkResult = chunker.chunk(content, inputParameters); assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 1c678fa9c..583a4b2c7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -35,6 +35,11 @@ public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { @Before @SneakyThrows public void setup() { + FixedTokenLengthChunker = createFixedTokenLengthChunker(Map.of()); + } + + @SneakyThrows + public FixedTokenLengthChunker createFixedTokenLengthChunker(Map parameters) { Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); Environment environment = TestEnvironment.newEnvironment(settings); AnalysisPlugin plugin = new AnalysisPlugin() { @@ -51,7 +56,7 @@ public Map> getTokeniz } }; AnalysisRegistry analysisRegistry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); - FixedTokenLengthChunker = new FixedTokenLengthChunker(analysisRegistry); + return new FixedTokenLengthChunker(analysisRegistry, parameters); } public void testValidateParameters_whenNoParams_thenSuccessful() { @@ -116,7 +121,7 @@ public void testValidateParameters_whenIllegalTokenizerType_thenFail() { () -> FixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( - "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + 
String.class.getName() + "]", + "Chunker parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]", illegalArgumentException.getMessage() ); } @@ -125,13 +130,15 @@ public void testChunk_withTokenLimit_10() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); parameters.put(TOKENIZER_FIELD, "standard"); - parameters.put(MAX_TOKEN_COUNT_FIELD, 10000); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + Map runtimeParameters = new HashMap<>(); + runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; - List passages = FixedTokenLengthChunker.chunk(content, parameters); + List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -140,13 +147,15 @@ public void testChunk_withTokenLimit_20() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 20); parameters.put(TOKENIZER_FIELD, "standard"); - parameters.put(MAX_TOKEN_COUNT_FIELD, 10000); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + Map runtimeParameters = new HashMap<>(); + runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; - List passages = FixedTokenLengthChunker.chunk(content, parameters); + List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); List expectedPassages = new ArrayList<>(); expectedPassages.add( - "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by" + "This is an example document to be chunked The document contains a single paragraph two sentences and 24 tokens by" ); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); @@ -156,13 +165,14 @@ public void testChunk_withOverlapRate_half() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); parameters.put(OVERLAP_RATE_FIELD, 0.5); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; - List passages = FixedTokenLengthChunker.chunk(content, parameters); + List passages = fixedTokenLengthChunker.chunk(content, Map.of()); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("to be chunked. 
The document contains a single paragraph, two"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("to be chunked The document contains a single paragraph two"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } From 7fe93c0a5c55b0b705d2448d4f79522ab17c649b Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 13:48:30 +0800 Subject: [PATCH 113/189] fix UT and chunker factory Signed-off-by: xinyual --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 6 +----- .../neuralsearch/processor/chunker/ChunkerFactory.java | 2 +- .../processor/chunker/FixedTokenLengthChunker.java | 6 +++--- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index c3432dfaa..275438a88 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -30,7 +30,6 @@ import org.opensearch.neuralsearch.processor.chunker.Chunker; import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM; /** * This processor is used for chunking user input data and chunked data could be used for downstream embedding processor, @@ -112,10 +111,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { ); } Map chunkerParameters = (Map) algorithmValue; - if (Objects.equals(algorithmKey, FIXED_TOKEN_LENGTH_ALGORITHM)) { - chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); - } - this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); + this.chunker = ChunkerFactory.create(algorithmKey, analysisRegistry, chunkerParameters); if (chunkerParameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString(); if (!(NumberUtils.isParsable(maxChunkLimitString))) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 99460c762..332c62c4f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -25,7 +25,7 @@ public static Chunker create(String type, AnalysisRegistry analysisRegistry, Map return new DelimiterChunker(parameters); default: throw new IllegalArgumentException( - "chunker type [" + type + "] is not supported. Supported chunkers types are " + ChunkerFactory.getAllChunkers() + "chunker type [" + type + "] is not supported. 
Supported chunkers types are " + ChunkerFactory.getAllChunkers() ); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index b63bed987..95b80363d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -75,13 +75,13 @@ public void validateParameters(Map parameters) { String overlapRateString = parameters.get(OVERLAP_RATE_FIELD).toString(); if (!(NumberUtils.isParsable(overlapRateString))) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } Double overlapRate = Double.valueOf(overlapRateString); if (overlapRate < 0 || overlapRate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND ); } this.overlapRate = overlapRate; @@ -119,7 +119,7 @@ private int validatePositiveIntegerParameter(Map parameters, Str String fieldValue = parameters.get(fieldName).toString(); if (!(NumberUtils.isParsable(fieldValue))) { throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } if (NumberUtils.createInteger(fieldValue) <= 0) { From cefb0a6a84e0929132a12b6be40edcddeacad33e Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 14:04:18 +0800 Subject: [PATCH 114/189] move analysis_registry to non-runtime parameters Signed-off-by: xinyual --- .../neuralsearch/processor/DocumentChunkingProcessor.java | 7 ++++++- .../neuralsearch/processor/chunker/ChunkerFactory.java | 6 ++++-- .../processor/chunker/FixedTokenLengthChunker.java | 1 + .../processor/chunker/ChunkerFactoryTests.java | 8 +++++--- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 275438a88..526910b44 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -31,6 +31,8 @@ import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM; + /** * This processor is used for chunking user input data and chunked data could be used for downstream embedding processor, * algorithm can be used to indicate chunking algorithm and parameters, @@ -111,7 +113,10 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { ); } Map chunkerParameters = (Map) algorithmValue; - this.chunker = ChunkerFactory.create(algorithmKey, analysisRegistry, chunkerParameters); + if (Objects.equals(algorithmKey, FIXED_TOKEN_LENGTH_ALGORITHM)) { + 
chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); + } + this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); if (chunkerParameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString(); if (!(NumberUtils.isParsable(maxChunkLimitString))) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 332c62c4f..b86894f60 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -9,6 +9,8 @@ import org.opensearch.index.analysis.AnalysisRegistry; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; + /** * A factory to create different chunking algorithm classes and return all supported chunking algorithms. */ @@ -17,10 +19,10 @@ public class ChunkerFactory { public static final String FIXED_TOKEN_LENGTH_ALGORITHM = "fixed_token_length"; public static final String DELIMITER_ALGORITHM = "delimiter"; - public static Chunker create(String type, AnalysisRegistry analysisRegistry, Map parameters) { + public static Chunker create(String type, Map parameters) { switch (type) { case FIXED_TOKEN_LENGTH_ALGORITHM: - return new FixedTokenLengthChunker(analysisRegistry, parameters); + return new FixedTokenLengthChunker((AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD), parameters); case DELIMITER_ALGORITHM: return new DelimiterChunker(parameters); default: diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 95b80363d..9fb8245ef 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -22,6 +22,7 @@ @Log4j2 public class FixedTokenLengthChunker implements Chunker { + public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 32f1fd924..1860e1ec9 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -11,6 +11,8 @@ import java.util.Map; import java.util.Set; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; + public class ChunkerFactoryTests extends OpenSearchTestCase { @Mock @@ -22,13 +24,13 @@ public void testGetAllChunkers() { } public void testCreate_FixedTokenLength() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, analysisRegistry, Map.of()); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry)); assertNotNull(chunker); assertTrue(chunker instanceof FixedTokenLengthChunker); } 
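// reviewer commentary, not part of the patch: after this change the factory no longer takes
// the AnalysisRegistry as an explicit argument; callers tuck it into the parameter map under
// ANALYSIS_REGISTRY_FIELD and the factory reads it back when constructing the fixed-token-length chunker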
public void testCreate_Delimiter() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, analysisRegistry, Map.of()); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry)); assertNotNull(chunker); assertTrue(chunker instanceof DelimiterChunker); } @@ -37,7 +39,7 @@ public void testCreate_Invalid() { String invalidChunkerType = "Invalid Chunker Type"; IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> ChunkerFactory.create(invalidChunkerType, analysisRegistry, Map.of()) + () -> ChunkerFactory.create(invalidChunkerType, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry)) ); assert (illegalArgumentException.getMessage().contains("chunker type [" + invalidChunkerType + "] is not supported.")); } From 16038af16e65b696eddab48ba4a9a5dd2da8b91a Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 14:29:06 +0800 Subject: [PATCH 115/189] fix Uts Signed-off-by: xinyual --- .../processor/chunker/ChunkerFactory.java | 6 +--- .../chunker/FixedTokenLengthChunker.java | 34 +++++++++---------- .../chunker/ChunkerFactoryTests.java | 13 +++++-- .../chunker/FixedTokenLengthChunkerTests.java | 8 +++-- 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index b86894f60..086cb0b71 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -7,10 +7,6 @@ import java.util.Map; import java.util.Set; -import org.opensearch.index.analysis.AnalysisRegistry; - -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; - /** * A factory to create different chunking algorithm classes and return all supported chunking algorithms. 
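* Supported algorithm identifiers are {@code fixed_token_length} and {@code delimiter}.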
*/ @@ -22,7 +18,7 @@ public class ChunkerFactory { public static Chunker create(String type, Map parameters) { switch (type) { case FIXED_TOKEN_LENGTH_ALGORITHM: - return new FixedTokenLengthChunker((AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD), parameters); + return new FixedTokenLengthChunker(parameters); case DELIMITER_ALGORITHM: return new DelimiterChunker(parameters); default: diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 9fb8245ef..cbdc57f30 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -32,15 +32,15 @@ public class FixedTokenLengthChunker implements Chunker { // default values for each parameter private static final int DEFAULT_TOKEN_LIMIT = 384; - private static final Double DEFAULT_OVERLAP_RATE = 0.0; + private static final double DEFAULT_OVERLAP_RATE = 0.0; private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; private static final String DEFAULT_TOKENIZER = "standard"; private static final String DEFAULT_TOKEN_CONCATENATOR = " "; - private static final Double OVERLAP_RATE_UPPER_BOUND = 0.5; + private static final double OVERLAP_RATE_UPPER_BOUND = 0.5; - private Double overlapRate; + private double overlapRate; private int tokenLimit; @@ -50,9 +50,9 @@ public class FixedTokenLengthChunker implements Chunker { private final AnalysisRegistry analysisRegistry; - public FixedTokenLengthChunker(AnalysisRegistry analysisRegistry, Map parameters) { + public FixedTokenLengthChunker(Map parameters) { validateParameters(parameters); - this.analysisRegistry = analysisRegistry; + this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD); } /** @@ -79,8 +79,8 @@ public void validateParameters(Map parameters) { "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - Double overlapRate = Double.valueOf(overlapRateString); - if (overlapRate < 0 || overlapRate.compareTo(OVERLAP_RATE_UPPER_BOUND) > 0) { + double overlapRate = NumberUtils.createDouble(overlapRateString); + if (overlapRate < 0 || overlapRate > OVERLAP_RATE_UPPER_BOUND) { throw new IllegalArgumentException( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND ); @@ -103,12 +103,13 @@ public static String validateStringParameters( // all parameters are optional return defaultValue; } - if (!(parameters.get(fieldName) instanceof String)) { + Object fieldValue = parameters.get(fieldName); + if (!(fieldValue instanceof String)) { throw new IllegalArgumentException("Chunker parameter [" + fieldName + "] cannot be cast to [" + String.class.getName() + "]"); - } else if (StringUtils.isEmpty(parameters.get(fieldName).toString()) && !allowEmpty) { + } else if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { throw new IllegalArgumentException("Chunker parameter: " + fieldName + " should not be empty."); } - return (String) parameters.get(fieldName); + return (String) fieldValue; } private int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { @@ -133,16 +134,13 @@ private int validatePositiveIntegerParameter(Map parameters, Str * Return the chunked passages for fixed token length algorithm * * @param content input string - * @param parameters a map 
containing parameters, containing the following parameters - * 1. tokenizer the analyzer tokenizer in OpenSearch - * 2. token_limit the token limit for each chunked passage - * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - * 4. max_token_count the max token limit for the tokenizer + * @param runtimeParameters a map containing runtimeParameters, containing the following runtimeParameters + * max_token_count the max token limit for the tokenizer */ @Override - public List chunk(String content, Map parameters) { - // prior to chunking, parameters have been validated - int maxTokenCount = validatePositiveIntegerParameter(parameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); + public List chunk(String content, Map runtimeParameters) { + // prior to chunking, runtimeParameters have been validated + int maxTokenCount = validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 1860e1ec9..a26021559 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -8,6 +8,7 @@ import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.test.OpenSearchTestCase; +import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -24,13 +25,13 @@ public void testGetAllChunkers() { } public void testCreate_FixedTokenLength() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry)); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createChunkParameters()); assertNotNull(chunker); assertTrue(chunker instanceof FixedTokenLengthChunker); } public void testCreate_Delimiter() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry)); + Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, createChunkParameters()); assertNotNull(chunker); assertTrue(chunker instanceof DelimiterChunker); } @@ -39,8 +40,14 @@ public void testCreate_Invalid() { String invalidChunkerType = "Invalid Chunker Type"; IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> ChunkerFactory.create(invalidChunkerType, Map.of(ANALYSIS_REGISTRY_FIELD, analysisRegistry)) + () -> ChunkerFactory.create(invalidChunkerType, createChunkParameters()) ); assert (illegalArgumentException.getMessage().contains("chunker type [" + invalidChunkerType + "] is not supported.")); } + + private Map createChunkParameters() { + Map parameters = new HashMap<>(); + parameters.put(ANALYSIS_REGISTRY_FIELD, analysisRegistry); + return parameters; + } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 583a4b2c7..a169e788a 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ 
b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -23,9 +23,10 @@ import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD; public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { @@ -40,6 +41,8 @@ public void setup() { @SneakyThrows public FixedTokenLengthChunker createFixedTokenLengthChunker(Map parameters) { + Map nonruntimeParameters = new HashMap<>(); + nonruntimeParameters.putAll(parameters); Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); Environment environment = TestEnvironment.newEnvironment(settings); AnalysisPlugin plugin = new AnalysisPlugin() { @@ -56,7 +59,8 @@ public Map> getTokeniz } }; AnalysisRegistry analysisRegistry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); - return new FixedTokenLengthChunker(analysisRegistry, parameters); + nonruntimeParameters.put(ANALYSIS_REGISTRY_FIELD, analysisRegistry); + return new FixedTokenLengthChunker(nonruntimeParameters); } public void testValidateParameters_whenNoParams_thenSuccessful() { From d1d88dcae047ee4a1f66f8ba838cb7eae3c7bca5 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 14:30:53 +0800 Subject: [PATCH 116/189] avoid java doc change Signed-off-by: xinyual --- .../processor/chunker/FixedTokenLengthChunker.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index cbdc57f30..bc39607cc 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -56,14 +56,14 @@ public FixedTokenLengthChunker(Map parameters) { } /** - * Validate the chunked passages for fixed token length algorithm, + * Validate and parse the parameters for fixed token length algorithm, * will throw IllegalArgumentException when parameters are invalid * * @param parameters a map containing parameters, containing the following parameters: - * 1. tokenizer the analyzer tokenizer in opensearch, please check https://opensearch.org/docs/latest/analyzers/tokenizers/index/ - * 2. token_limit the token limit for each chunked passage - * 3. overlap_rate the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - * 4. max_token_count the max token limit for the tokenizer + * 1. tokenizer: the analyzer tokenizer in opensearch + * 2. token_limit: the token limit for each chunked passage + * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage + * 4. 
max_token_count: the max token limit for the tokenizer * Here are requirements for parameters: * max_token_count and token_limit should be a positive integer * overlap_rate should be within range [0, 0.5] From eb439bd557c8687e16e611f27b6d23260fc3f474 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 14:46:25 +0800 Subject: [PATCH 117/189] move validate to commonUtils Signed-off-by: xinyual --- .../processor/chunker/ChunkerUtils.java | 50 +++++++++++++++++++ .../processor/chunker/DelimiterChunker.java | 2 +- .../chunker/FixedTokenLengthChunker.java | 39 ++------------- 3 files changed, 55 insertions(+), 36 deletions(-) create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java new file mode 100644 index 000000000..d638de04e --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java @@ -0,0 +1,50 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.neuralsearch.processor.chunker; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.math.NumberUtils; + +import java.util.Map; + +public class ChunkerUtils { + public static String validateStringParameters( + Map parameters, + String fieldName, + String defaultValue, + boolean allowEmpty + ) { + if (!parameters.containsKey(fieldName)) { + // all parameters are optional + return defaultValue; + } + Object fieldValue = parameters.get(fieldName); + if (!(fieldValue instanceof String)) { + throw new IllegalArgumentException("Chunker parameter [" + fieldName + "] cannot be cast to [" + String.class.getName() + "]"); + } else if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { + throw new IllegalArgumentException("Chunker parameter: " + fieldName + " should not be empty."); + } + return (String) fieldValue; + } + + public static int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { + // this method validate that parameter is a positive integer + if (!parameters.containsKey(fieldName)) { + // all parameters are optional + return defaultValue; + } + String fieldValue = parameters.get(fieldName).toString(); + if (!(NumberUtils.isParsable(fieldValue))) { + throw new IllegalArgumentException( + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + ); + } + if (NumberUtils.createInteger(fieldValue) <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); + } + return Integer.valueOf(fieldValue); + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index dc37b7876..0981075ba 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -8,7 +8,7 @@ import java.util.List; import java.util.ArrayList; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.validateStringParameters; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerUtils.validateStringParameters; /** * The implementation {@link Chunker} for delimiter algorithm
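[Reviewer note, not part of the patch] A hedged sketch of the relocated helpers in use, assuming the plugin classes above are on the classpath; the field names and values below are illustrative, and both helpers treat every field as optional:

import java.util.Map;

class ChunkerUtilsUsageSketch {
    public static void main(String[] args) {
        // absent keys fall back to the supplied default; a non-String value raises IllegalArgumentException
        String delimiter = ChunkerUtils.validateStringParameters(Map.of(), "delimiter", ".", false);
        // numeric strings are parsed; zero or negative values are rejected at pipeline creation time
        int tokenLimit = ChunkerUtils.validatePositiveIntegerParameter(Map.of("token_limit", "384"), "token_limit", 500);
        System.out.println(delimiter + " / " + tokenLimit); // prints: . / 384
    }
}

diff --git 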
a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index bc39607cc..b0c8f542a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -15,6 +15,8 @@ import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerUtils.validatePositiveIntegerParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerUtils.validateStringParameters; /** * The implementation {@link Chunker} for fixed token length algorithm. @@ -93,42 +95,9 @@ public void validateParameters(Map parameters) { this.tokenConcatenator = validateStringParameters(parameters, TOKEN_CONCATENATOR_FIELD, DEFAULT_TOKEN_CONCATENATOR, true); } - public static String validateStringParameters( - Map parameters, - String fieldName, - String defaultValue, - boolean allowEmpty - ) { - if (!parameters.containsKey(fieldName)) { - // all parameters are optional - return defaultValue; - } - Object fieldValue = parameters.get(fieldName); - if (!(fieldValue instanceof String)) { - throw new IllegalArgumentException("Chunker parameter [" + fieldName + "] cannot be cast to [" + String.class.getName() + "]"); - } else if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { - throw new IllegalArgumentException("Chunker parameter: " + fieldName + " should not be empty."); - } - return (String) fieldValue; - } - private int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { - // this method validate that parameter is a positive integer - if (!parameters.containsKey(fieldName)) { - // all parameters are optional - return defaultValue; - } - String fieldValue = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValue))) { - throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - if (NumberUtils.createInteger(fieldValue) <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + fieldName + "] must be positive"); - } - return Integer.valueOf(fieldValue); - } + + /** * Return the chunked passages for fixed token length algorithm From bc7f70cfb02742873bb5230f5468260360b4722c Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 14:48:10 +0800 Subject: [PATCH 118/189] remove useless function Signed-off-by: xinyual --- .../processor/chunker/DelimiterChunker.java | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 0981075ba..412cdf5f8 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -57,14 +57,4 @@ public List chunk(String content) { return chunkResult; } - /** - * Return the chunked passages for delimiter algorithm - * - * @param content input string - * @param parameters a map containing parameters, containing the following parameters - */ - @Override - public List 
chunk(String content, Map parameters) { - return chunk(content); - } } From bb941cd696717514e62260c485159cdf84b77006 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 14:53:14 +0800 Subject: [PATCH 119/189] change java doc Signed-off-by: xinyual --- .../processor/chunker/ChunkerUtils.java | 11 ++++----- .../chunker/FixedTokenLengthChunker.java | 23 ++++++++----------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java index d638de04e..be8074c50 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java @@ -2,7 +2,6 @@ * Copyright OpenSearch Contributors * SPDX-License-Identifier: Apache-2.0 */ - package org.opensearch.neuralsearch.processor.chunker; import org.apache.commons.lang3.StringUtils; @@ -12,10 +11,10 @@ public class ChunkerUtils { public static String validateStringParameters( - Map parameters, - String fieldName, - String defaultValue, - boolean allowEmpty + Map parameters, + String fieldName, + String defaultValue, + boolean allowEmpty ) { if (!parameters.containsKey(fieldName)) { // all parameters are optional @@ -39,7 +38,7 @@ public static int validatePositiveIntegerParameter(Map parameter String fieldValue = parameters.get(fieldName).toString(); if (!(NumberUtils.isParsable(fieldValue))) { throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" ); } if (NumberUtils.createInteger(fieldValue) <= 0) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index b0c8f542a..1775e752b 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -9,7 +9,6 @@ import java.util.List; import java.util.ArrayList; import lombok.extern.log4j.Log4j2; -import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; import org.opensearch.index.analysis.AnalysisRegistry; @@ -95,15 +94,11 @@ public void validateParameters(Map parameters) { this.tokenConcatenator = validateStringParameters(parameters, TOKEN_CONCATENATOR_FIELD, DEFAULT_TOKEN_CONCATENATOR, true); } - - - - /** * Return the chunked passages for fixed token length algorithm * * @param content input string - * @param runtimeParameters a map containing runtimeParameters, containing the following runtimeParameters + * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: * max_token_count the max token limit for the tokenizer */ @Override @@ -114,19 +109,19 @@ public List chunk(String content, Map runtimeParameters) List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); - Double overlapTokenNumberDouble = overlapRate * tokenLimit; - int overlapTokenNumber = overlapTokenNumberDouble.intValue(); + double overlapTokenNumberDouble = overlapRate * tokenLimit; + int overlapTokenNumber = (int) Math.round(overlapTokenNumberDouble); - int startToken = 0; - while (startToken < tokens.size()) 
{ - if (startToken + tokenLimit >= tokens.size()) { + int startTokenIndex = 0; + while (startTokenIndex < tokens.size()) { + if (startTokenIndex + tokenLimit >= tokens.size()) { // break the loop when already cover the last token - passages.add(String.join(tokenConcatenator, tokens.subList(startToken, tokens.size()))); + passages.add(String.join(tokenConcatenator, tokens.subList(startTokenIndex, tokens.size()))); break; } else { - passages.add(String.join(tokenConcatenator, tokens.subList(startToken, startToken + tokenLimit))); + passages.add(String.join(tokenConcatenator, tokens.subList(startTokenIndex, startTokenIndex + tokenLimit))); } - startToken += tokenLimit - overlapTokenNumber; + startTokenIndex += tokenLimit - overlapTokenNumber; } return passages; } From 77d4101dc5b1e51059baf7360d7d182bfca4c4e4 Mon Sep 17 00:00:00 2001 From: xinyual Date: Mon, 11 Mar 2024 15:03:03 +0800 Subject: [PATCH 120/189] fix DocumentChunkingProcessor unit tests Signed-off-by: xinyual --- .../processor/DocumentChunkingProcessor.java | 2 +- .../DocumentChunkingProcessorTests.java | 28 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index 526910b44..1807bb970 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -76,12 +76,12 @@ public DocumentChunkingProcessor( AnalysisRegistry analysisRegistry ) { super(tag, description); - validateAndParseAlgorithmMap(algorithmMap); this.fieldMap = fieldMap; this.environment = environment; this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; + validateAndParseAlgorithmMap(algorithmMap); } public String getType() { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 9fffce3ac..bd0a2a32d 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -360,8 +360,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -376,8 +376,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. 
The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -407,8 +407,8 @@ public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() { Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -440,11 +440,11 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSuccee assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is the first document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is the first document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); - expectedPassages.add("This is the second document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is the second document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -488,8 +488,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } @@ -532,8 +532,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_the Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. 
The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked The document"); + expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); expectedPassages.add("standard tokenizer in OpenSearch"); assert (nestedResult instanceof List); assertEquals(((List) nestedResult).size(), 2); From 92f587f2f27d3e329cbb44c753572519eab7d9fd Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 11 Mar 2024 22:52:09 +0800 Subject: [PATCH 121/189] fixed token length: re-implement with start and end offset Signed-off-by: yuye-aws --- ...ls.java => ChunkerParameterValidator.java} | 3 +- .../processor/chunker/DelimiterChunker.java | 2 +- .../chunker/FixedTokenLengthChunker.java | 40 +++++++--------- .../DocumentChunkingProcessorIT.java | 10 ++-- .../DocumentChunkingProcessorTests.java | 46 +++++++++---------- .../chunker/DelimiterChunkerTests.java | 28 ++++------- .../chunker/FixedTokenLengthChunkerTests.java | 45 +++++++++--------- 7 files changed, 78 insertions(+), 96 deletions(-) rename src/main/java/org/opensearch/neuralsearch/processor/chunker/{ChunkerUtils.java => ChunkerParameterValidator.java} (97%) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java similarity index 97% rename from src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java rename to src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index be8074c50..b3f399074 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtils.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -9,7 +9,8 @@ import java.util.Map; -public class ChunkerUtils { +public class ChunkerParameterValidator { + public static String validateStringParameters( Map parameters, String fieldName, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 412cdf5f8..86f5aac69 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -8,7 +8,7 @@ import java.util.List; import java.util.ArrayList; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerUtils.validateStringParameters; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameters; /** * The implementation {@link Chunker} for delimiter algorithm diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 1775e752b..6c035753a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -13,9 +13,10 @@ import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; +import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; -import static 
org.opensearch.neuralsearch.processor.chunker.ChunkerUtils.validatePositiveIntegerParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerUtils.validateStringParameters; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameters; /** * The implementation {@link Chunker} for fixed token length algorithm. @@ -29,24 +30,17 @@ public class FixedTokenLengthChunker implements Chunker { public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; public static final String TOKENIZER_FIELD = "tokenizer"; - public static final String TOKEN_CONCATENATOR_FIELD = "token_concatenator"; - // default values for each parameter private static final int DEFAULT_TOKEN_LIMIT = 384; private static final double DEFAULT_OVERLAP_RATE = 0.0; private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; private static final String DEFAULT_TOKENIZER = "standard"; - private static final String DEFAULT_TOKEN_CONCATENATOR = " "; - private static final double OVERLAP_RATE_UPPER_BOUND = 0.5; private double overlapRate; private int tokenLimit; - - private String tokenConcatenator; - private String tokenizer; private final AnalysisRegistry analysisRegistry; @@ -91,7 +85,6 @@ public void validateParameters(Map parameters) { this.overlapRate = DEFAULT_OVERLAP_RATE; } this.tokenizer = validateStringParameters(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER, false); - this.tokenConcatenator = validateStringParameters(parameters, TOKEN_CONCATENATOR_FIELD, DEFAULT_TOKEN_CONCATENATOR, true); } /** @@ -106,38 +99,37 @@ public List chunk(String content, Map runtimeParameters) // prior to chunking, runtimeParameters have been validated int maxTokenCount = validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); - List tokens = tokenize(content, tokenizer, maxTokenCount); + List tokens = tokenize(content, tokenizer, maxTokenCount); List passages = new ArrayList<>(); - double overlapTokenNumberDouble = overlapRate * tokenLimit; - int overlapTokenNumber = (int) Math.round(overlapTokenNumberDouble); - int startTokenIndex = 0; + int startContentPosition, endContentPosition; + int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); + while (startTokenIndex < tokens.size()) { + startContentPosition = tokens.get(startTokenIndex).getStartOffset(); if (startTokenIndex + tokenLimit >= tokens.size()) { - // break the loop when already cover the last token - passages.add(String.join(tokenConcatenator, tokens.subList(startTokenIndex, tokens.size()))); + // include all characters till the end if no next passage + endContentPosition = content.length(); + passages.add(content.substring(startContentPosition, endContentPosition)); break; } else { - passages.add(String.join(tokenConcatenator, tokens.subList(startTokenIndex, startTokenIndex + tokenLimit))); + // include gap characters between two passages + endContentPosition = tokens.get(startTokenIndex + tokenLimit).getStartOffset() - 1; + passages.add(content.substring(startContentPosition, endContentPosition)); } startTokenIndex += tokenLimit - overlapTokenNumber; } return passages; } - private List tokenize(String content, String tokenizer, int maxTokenCount) { + private List tokenize(String content, String tokenizer, int maxTokenCount) { AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); analyzeRequest.text(content); 
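// note: maxTokenCount mirrors the index-level analyze limit; if the tokenizer emits more tokens than allowed, the analyze call below is expected to throw rather than truncate, which the max_token_count IT in this series relies on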
analyzeRequest.tokenizer(tokenizer); try { AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); - List tokenList = new ArrayList<>(); - List analyzeTokenList = analyzeResponse.getTokens(); - for (AnalyzeAction.AnalyzeToken analyzeToken : analyzeTokenList) { - tokenList.add(analyzeToken.getTerm()); - } - return tokenList; + return analyzeResponse.getTokens(); } catch (IOException e) { throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index 762db29bd..2b6ee7898 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -76,7 +76,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked. The document"); expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("standard tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME, null, null); @@ -92,7 +92,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked. The document"); expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard"); - expectedPassages.add("tokenizer in OpenSearch"); + expectedPassages.add("tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME, null, null); @@ -108,7 +108,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercase List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked. 
The document"); expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard"); - expectedPassages.add("tokenizer in OpenSearch"); + expectedPassages.add("tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null); @@ -154,9 +154,9 @@ public void testDocumentChunkingProcessor_withCascadePipeline_successful() throw ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked"); + expectedPassages.add("This is an example document to be chunked."); expectedPassages.add("The document contains a single paragraph, two sentences and 24"); - expectedPassages.add("tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("tokens by standard tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); expectedPassages.clear(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index bd0a2a32d..eb7820c57 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -360,9 +360,9 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -376,9 +376,9 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } } @@ -407,9 +407,9 @@ public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() { Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("This is an example document to be chunked. 
The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -440,12 +440,12 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSuccee assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is the first document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); - expectedPassages.add("This is the second document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("This is the first document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); + expectedPassages.add("This is the second document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -488,9 +488,9 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -524,17 +524,17 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() { @SneakyThrows @SuppressWarnings("unchecked") - public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceList_thenSucceed() { + public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceDataList_thenSucceed() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); - List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked. 
The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); assert (nestedResult instanceof List); assertEquals(((List) nestedResult).size(), 2); for (Object result : (List) nestedResult) { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 5f5539c37..ddc1b7eb7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -4,14 +4,12 @@ */ package org.opensearch.neuralsearch.processor.chunker; -import org.junit.Assert; -import org.opensearch.test.OpenSearchTestCase; - import java.util.List; import java.util.Map; -import static junit.framework.TestCase.assertEquals; -import static org.junit.Assert.assertThrows; +import org.junit.Assert; +import org.opensearch.test.OpenSearchTestCase; + import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; public class DelimiterChunkerTests extends OpenSearchTestCase { @@ -35,55 +33,49 @@ public void testChunkerWithDelimiterFieldNoString() { public void testChunker() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "a\nb\nc\nd"; - Map inputParameters = Map.of(); - List chunkResult = chunker.chunk(content, inputParameters); + List chunkResult = chunker.chunk(content); assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } public void testChunkerWithDefaultDelimiter() { DelimiterChunker chunker = new DelimiterChunker(Map.of()); String content = "a.b.c.d"; - Map inputParameters = Map.of(); - List chunkResult = chunker.chunk(content, inputParameters); + List chunkResult = chunker.chunk(content); assertEquals(List.of("a.", "b.", "c.", "d"), chunkResult); } public void testChunkerWithDelimiterEnd() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "a\nb\nc\nd\n"; - Map inputParameters = Map.of(); - List chunkResult = chunker.chunk(content, inputParameters); + List chunkResult = chunker.chunk(content); assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult); } public void testChunkerWithOnlyDelimiter() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "\n"; - Map inputParameters = Map.of(); - List chunkResult = chunker.chunk(content, inputParameters); + List chunkResult = chunker.chunk(content); assertEquals(List.of("\n"), chunkResult); } public void testChunkerWithAllDelimiters() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "\n\n\n"; - Map inputParameters = Map.of(); - List chunkResult = chunker.chunk(content, inputParameters); + List chunkResult = chunker.chunk(content); assertEquals(List.of("\n", "\n", "\n"), chunkResult); } public void testChunkerWithDifferentDelimiters() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, ".")); String content = "a.b.cc.d."; - List chunkResult = chunker.chunk(content, Map.of()); + List chunkResult = chunker.chunk(content); assertEquals(List.of("a.", "b.", "cc.", "d."), chunkResult); } public void testChunkerWithStringDelimiter() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n")); String content = "\n\na\n\n\n"; - Map inputParameters = Map.of(); - 
List chunkResult = chunker.chunk(content, inputParameters); + List chunkResult = chunker.chunk(content); assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index a169e788a..484e20799 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -20,9 +20,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; + import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; @@ -31,18 +31,16 @@ public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { - private org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker FixedTokenLengthChunker; + private FixedTokenLengthChunker fixedTokenLengthChunker; @Before - @SneakyThrows public void setup() { - FixedTokenLengthChunker = createFixedTokenLengthChunker(Map.of()); + fixedTokenLengthChunker = createFixedTokenLengthChunker(Map.of()); } @SneakyThrows public FixedTokenLengthChunker createFixedTokenLengthChunker(Map parameters) { - Map nonruntimeParameters = new HashMap<>(); - nonruntimeParameters.putAll(parameters); + Map nonRuntimeParameters = new HashMap<>(parameters); Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); Environment environment = TestEnvironment.newEnvironment(settings); AnalysisPlugin plugin = new AnalysisPlugin() { @@ -59,13 +57,12 @@ public Map> getTokeniz } }; AnalysisRegistry analysisRegistry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); - nonruntimeParameters.put(ANALYSIS_REGISTRY_FIELD, analysisRegistry); - return new FixedTokenLengthChunker(nonruntimeParameters); + nonRuntimeParameters.put(ANALYSIS_REGISTRY_FIELD, analysisRegistry); + return new FixedTokenLengthChunker(nonRuntimeParameters); } public void testValidateParameters_whenNoParams_thenSuccessful() { - Map parameters = new HashMap<>(); - FixedTokenLengthChunker.validateParameters(parameters); + fixedTokenLengthChunker.validateParameters(Map.of()); } public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { @@ -73,7 +70,7 @@ public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> FixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", @@ -86,7 +83,7 @@ public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { parameters.put(TOKEN_LIMIT_FIELD, -1); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> FixedTokenLengthChunker.validateParameters(parameters) + () -> 
fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive", illegalArgumentException.getMessage()); } @@ -96,7 +93,7 @@ public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> FixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", @@ -109,7 +106,7 @@ public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { parameters.put(OVERLAP_RATE_FIELD, 0.6); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> FixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 0.5", @@ -122,7 +119,7 @@ public void testValidateParameters_whenIllegalTokenizerType_thenFail() { parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> FixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( "Chunker parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]", @@ -141,9 +138,9 @@ public void testChunk_withTokenLimit_10() { "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -159,9 +156,9 @@ public void testChunk_withTokenLimit_20() { List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); List expectedPassages = new ArrayList<>(); expectedPassages.add( - "This is an example document to be chunked The document contains a single paragraph two sentences and 24 tokens by" + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by" ); - expectedPassages.add("standard tokenizer in OpenSearch"); + expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -174,10 +171,10 @@ public void testChunk_withOverlapRate_half() { "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = fixedTokenLengthChunker.chunk(content, Map.of()); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked The document"); - expectedPassages.add("to be chunked The document contains a single paragraph two"); - expectedPassages.add("contains a single paragraph two sentences and 24 tokens by"); - expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("This is an example document to be chunked. The document"); + expectedPassages.add("to be chunked. The document contains a single paragraph, two"); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } } From 94b1967cfa2aa88190fee65ac836f05b92d569c2 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 11:50:33 +0800 Subject: [PATCH 122/189] update exception message Signed-off-by: yuye-aws --- .../processor/chunker/FixedTokenLengthChunker.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 6c035753a..aad430c03 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -8,7 +8,6 @@ import java.util.Map; import java.util.List; import java.util.ArrayList; -import lombok.extern.log4j.Log4j2; import org.apache.commons.lang3.math.NumberUtils; import org.opensearch.index.analysis.AnalysisRegistry; @@ -21,7 +20,6 @@ /** * The implementation {@link Chunker} for fixed token length algorithm. 
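 * Each passage carries token_limit tokens (default 384) and overlaps the previous passage by floor(token_limit * overlap_rate) tokens, where overlap_rate ranges from 0.0 to 0.5 (default 0.0); tokenization uses the configured tokenizer (default "standard") bounded by the runtime max_token_count (default 10000).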
*/ -@Log4j2 public class FixedTokenLengthChunker implements Chunker { public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; @@ -131,7 +129,7 @@ private List tokenize(String content, String tokenizer, int maxTok AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); return analyzeResponse.getTokens(); } catch (IOException e) { - throw new RuntimeException("Fixed token length algorithm meet with exception: " + e); + throw new IllegalStateException("Fixed token length algorithm encounters exception: " + e.getMessage(), e); } } } From 98944d1f11afc5048797b7eafd46452a2d4eb83f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 14:42:37 +0800 Subject: [PATCH 123/189] fix document chunking processor IT Signed-off-by: yuye-aws --- .../DocumentChunkingProcessorIT.java | 19 +++++++++---------- ...nLengthChunkerWithLowercaseTokenizer.json} | 0 2 files changed, 9 insertions(+), 10 deletions(-) rename src/test/resources/processor/chunker/{PipelineForFixedTokenLengthChunkerWithLowerCaseTokenizer.json => PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json} (100%) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index 2b6ee7898..cc69239fa 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -25,7 +25,7 @@ import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT; public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT { - private static final String INDEX_NAME = "document_chunking_index"; + private static final String INDEX_NAME = "document_chunking_test_index"; private static final String OUTPUT_FIELD = "body_chunk"; @@ -70,7 +70,7 @@ public void setUp() throws Exception { public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); - createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); + createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -86,7 +86,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); - createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); + createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -102,7 +102,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); - createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); + createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); 
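// the lowercase tokenizer produces the same token boundaries as the letter tokenizer on this document, so the expected passages below match the letter tokenizer test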
List expectedPassages = new ArrayList<>(); @@ -119,7 +119,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); - createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); + createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT)); // max_token_count is 100 by index settings assert (exception.getMessage() @@ -133,7 +133,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception { try { createPipelineProcessor(DELIMITER_PIPELINE_NAME); - createDocumentChunkingIndex(DELIMITER_PIPELINE_NAME); + createDocumentChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -150,7 +150,7 @@ public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() th public void testDocumentChunkingProcessor_withCascadePipeline_successful() throws Exception { try { createPipelineProcessor(CASCADE_PIPELINE_NAME); - createDocumentChunkingIndex(CASCADE_PIPELINE_NAME); + createDocumentChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -165,7 +165,6 @@ public void testDocumentChunkingProcessor_withCascadePipeline_successful() throw " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, expectedPassages); - } finally { wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null); } @@ -207,10 +206,10 @@ private void createPipelineProcessor(String pipelineName) throws Exception { assertEquals("true", node.get("acknowledged").toString()); } - private void createDocumentChunkingIndex(String pipelineName) throws Exception { + private void createDocumentChunkingIndex(String indexName, String pipelineName) throws Exception { URL indexSettingsURLPath = classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json"); assert indexSettingsURLPath != null; - createIndexWithConfiguration(INDEX_NAME, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName); + createIndexWithConfiguration(indexName, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName); } private void ingestDocument(String documentPath) throws Exception { diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowerCaseTokenizer.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json similarity index 100% rename from src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowerCaseTokenizer.json rename to src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json From 8799fd0bcfbbeb7cf66c3b686b96af2dcfc43e83 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 15:07:22 +0800 Subject: [PATCH 124/189] bug fix: adjust start, end content position in fixed token length algorithm Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 9 ++++-- .../DocumentChunkingProcessorIT.java | 14 +++++----- .../DocumentChunkingProcessorTests.java | 28 +++++++++---------- 
.../chunker/FixedTokenLengthChunkerTests.java | 12 ++++---- 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index aad430c03..acdd65aea 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -105,7 +105,12 @@ public List chunk(String content, Map runtimeParameters) int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); while (startTokenIndex < tokens.size()) { - startContentPosition = tokens.get(startTokenIndex).getStartOffset(); + if (startTokenIndex == 0) { + // include all characters till the start if no previous passage + startContentPosition = 0; + } else { + startContentPosition = tokens.get(startTokenIndex).getStartOffset(); + } if (startTokenIndex + tokenLimit >= tokens.size()) { // include all characters till the end if no next passage endContentPosition = content.length(); @@ -113,7 +118,7 @@ public List chunk(String content, Map runtimeParameters) break; } else { // include gap characters between two passages - endContentPosition = tokens.get(startTokenIndex + tokenLimit).getStartOffset() - 1; + endContentPosition = tokens.get(startTokenIndex + tokenLimit).getStartOffset(); passages.add(content.substring(startContentPosition, endContentPosition)); } startTokenIndex += tokenLimit - overlapTokenNumber; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java index cc69239fa..8c1ced0ed 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java @@ -74,8 +74,8 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { @@ -90,8 +90,8 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard "); expectedPassages.add("tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { @@ -106,8 +106,8 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercase ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. 
The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard "); expectedPassages.add("tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); } finally { @@ -155,7 +155,7 @@ public void testDocumentChunkingProcessor_withCascadePipeline_successful() throw List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked."); - expectedPassages.add("The document contains a single paragraph, two sentences and 24"); + expectedPassages.add(" The document contains a single paragraph, two sentences and 24 "); expectedPassages.add("tokens by standard tokenizer in OpenSearch."); validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index eb7820c57..b63a512d8 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -360,8 +360,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -376,8 +376,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -407,8 +407,8 @@ public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() { Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. 
The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -440,11 +440,11 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSuccee assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is the first document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is the first document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); - expectedPassages.add("This is the second document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is the second document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -488,8 +488,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() assert (passages instanceof List); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -532,8 +532,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceDataList Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); assert (nestedResult instanceof List); assertEquals(((List) nestedResult).size(), 2); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 484e20799..d9934c184 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -138,8 +138,8 @@ public void testChunk_withTokenLimit_10() { "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. 
The document"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } @@ -156,7 +156,7 @@ public void testChunk_withTokenLimit_20() { List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); List expectedPassages = new ArrayList<>(); expectedPassages.add( - "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by" + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by " ); expectedPassages.add("standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); @@ -171,9 +171,9 @@ public void testChunk_withOverlapRate_half() { "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = fixedTokenLengthChunker.chunk(content, Map.of()); List expectedPassages = new ArrayList<>(); - expectedPassages.add("This is an example document to be chunked. The document"); - expectedPassages.add("to be chunked. The document contains a single paragraph, two"); - expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by"); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("to be chunked. The document contains a single paragraph, two "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } From 5cda870a69fd11be864ed8ac329d286df74579f9 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 15:41:20 +0800 Subject: [PATCH 125/189] update changelog for 2.x release Signed-off-by: yuye-aws --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7fadfe4d..c3094af9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.12...2.x) ### Features +- Implement document chunking processor with fixed token length and delimiter algorithm ([#607](https://github.com/opensearch-project/neural-search/pull/607/)) - Enabled support for applying default modelId in neural sparse query ([#614](https://github.com/opensearch-project/neural-search/pull/614) ### Enhancements - Adding aggregations in hybrid query ([#630](https://github.com/opensearch-project/neural-search/pull/630)) From c942b17b0f4f0508ec70df44f87edd358fdc9ff7 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 15:49:30 +0800 Subject: [PATCH 126/189] rename processor Signed-off-by: yuye-aws --- .../neuralsearch/plugin/NeuralSearch.java | 8 +- ...cessor.java => TextChunkingProcessor.java} | 8 +- ...java => TextChunkingProcessorFactory.java} | 16 ++-- ...orIT.java => TextChunkingProcessorIT.java} | 46 +++++------ ...s.java => TextChunkingProcessorTests.java} | 81 +++++++++---------- ...=> TextChunkingProcessorFactoryTests.java} | 30 +++---- ...gs.json => TextChunkingIndexSettings.json} | 0 ...ent.json => TextChunkingTestDocument.json} | 0 
...json => TextChunkingTestLongDocument.json} | 0 9 files changed, 91 insertions(+), 98 deletions(-) rename src/main/java/org/opensearch/neuralsearch/processor/{DocumentChunkingProcessor.java => TextChunkingProcessor.java} (98%) rename src/main/java/org/opensearch/neuralsearch/processor/factory/{DocumentChunkingProcessorFactory.java => TextChunkingProcessorFactory.java} (75%) rename src/test/java/org/opensearch/neuralsearch/processor/{DocumentChunkingProcessorIT.java => TextChunkingProcessorIT.java} (82%) rename src/test/java/org/opensearch/neuralsearch/processor/{DocumentChunkingProcessorTests.java => TextChunkingProcessorTests.java} (86%) rename src/test/java/org/opensearch/neuralsearch/processor/factory/{DocumentChunkingProcessorFactoryTests.java => TextChunkingProcessorFactoryTests.java} (74%) rename src/test/resources/processor/chunker/{DocumentChunkingIndexSettings.json => TextChunkingIndexSettings.json} (100%) rename src/test/resources/processor/chunker/{DocumentChunkingTestDocument.json => TextChunkingTestDocument.json} (100%) rename src/test/resources/processor/chunker/{DocumentChunkingTestLongDocument.json => TextChunkingTestLongDocument.json} (100%) diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java index 8f60a6ff8..d54c644c4 100644 --- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java +++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java @@ -31,11 +31,11 @@ import org.opensearch.neuralsearch.processor.NormalizationProcessorWorkflow; import org.opensearch.neuralsearch.processor.SparseEncodingProcessor; import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor; -import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor; +import org.opensearch.neuralsearch.processor.TextChunkingProcessor; import org.opensearch.neuralsearch.processor.TextImageEmbeddingProcessor; import org.opensearch.neuralsearch.processor.combination.ScoreCombinationFactory; import org.opensearch.neuralsearch.processor.combination.ScoreCombiner; -import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory; +import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory; import org.opensearch.neuralsearch.processor.factory.NormalizationProcessorFactory; import org.opensearch.neuralsearch.processor.factory.RerankProcessorFactory; import org.opensearch.neuralsearch.processor.factory.SparseEncodingProcessorFactory; @@ -117,8 +117,8 @@ public Map getProcessors(Processor.Parameters paramet new SparseEncodingProcessorFactory(clientAccessor, parameters.env), TextImageEmbeddingProcessor.TYPE, new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()), - DocumentChunkingProcessor.TYPE, - new DocumentChunkingProcessorFactory( + TextChunkingProcessor.TYPE, + new TextChunkingProcessorFactory( parameters.env, parameters.ingestService.getClusterService(), parameters.indicesService, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java similarity index 98% rename from src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java rename to src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 1807bb970..8e46adcac 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -13,7 +13,6 @@ import java.util.Objects; import com.google.common.annotations.VisibleForTesting; -import lombok.extern.log4j.Log4j2; import org.apache.commons.lang3.math.NumberUtils; import org.opensearch.cluster.metadata.IndexMetadata; @@ -38,10 +37,9 @@ * algorithm can be used to indicate chunking algorithm and parameters, * and field_map can be used to indicate which fields need chunking and the corresponding keys for the chunking results. */ -@Log4j2 -public final class DocumentChunkingProcessor extends AbstractProcessor { +public final class TextChunkingProcessor extends AbstractProcessor { - public static final String TYPE = "chunking"; + public static final String TYPE = "text_chunking"; public static final String FIELD_MAP_FIELD = "field_map"; @@ -65,7 +63,7 @@ public final class DocumentChunkingProcessor extends AbstractProcessor { private final Environment environment; - public DocumentChunkingProcessor( + public TextChunkingProcessor( String tag, String description, Map fieldMap, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java similarity index 75% rename from src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java rename to src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java index 9fa38b48a..64db2aa83 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java @@ -11,17 +11,17 @@ import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.indices.IndicesService; import org.opensearch.ingest.Processor; -import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; +import org.opensearch.neuralsearch.processor.TextChunkingProcessor; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD; import static org.opensearch.ingest.ConfigurationUtils.readMap; /** * Factory for chunking ingest processor for ingestion pipeline. * Instantiates processor based on user provided input. 
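 * As an illustration only, a hypothetical processor configuration that this factory would accept
 * (the field names and parameter values below are example assumptions, not fixed defaults):
 * "text_chunking": {
 *   "field_map": { "body": "body_chunk" },
 *   "algorithm": { "fixed_token_length": { "token_limit": 10, "tokenizer": "standard" } }
 * }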
*/ -public class DocumentChunkingProcessorFactory implements Processor.Factory { +public class TextChunkingProcessorFactory implements Processor.Factory { private final Environment environment; @@ -31,7 +31,7 @@ public class DocumentChunkingProcessorFactory implements Processor.Factory { private final AnalysisRegistry analysisRegistry; - public DocumentChunkingProcessorFactory( + public TextChunkingProcessorFactory( Environment environment, ClusterService clusterService, IndicesService indicesService, @@ -44,7 +44,7 @@ public DocumentChunkingProcessorFactory( } @Override - public DocumentChunkingProcessor create( + public TextChunkingProcessor create( Map registry, String processorTag, String description, @@ -52,7 +52,7 @@ public DocumentChunkingProcessor create( ) throws Exception { Map fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD); Map algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD); - return new DocumentChunkingProcessor( + return new TextChunkingProcessor( processorTag, description, fieldMap, diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java similarity index 82% rename from src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java rename to src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java index 8c1ced0ed..a4e770e0a 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java @@ -24,29 +24,29 @@ import org.opensearch.neuralsearch.BaseNeuralSearchIT; import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT; -public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT { - private static final String INDEX_NAME = "document_chunking_test_index"; +public class TextChunkingProcessorIT extends BaseNeuralSearchIT { + private static final String INDEX_NAME = "text_chunking_test_index"; private static final String OUTPUT_FIELD = "body_chunk"; private static final String INTERMEDIATE_FIELD = "body_chunk_intermediate"; private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME = - "pipeline-document-chunking-fixed-token-length-standard-tokenizer"; + "pipeline-text-chunking-fixed-token-length-standard-tokenizer"; private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME = - "pipeline-document-chunking-fixed-token-length-letter-tokenizer"; + "pipeline-text-chunking-fixed-token-length-letter-tokenizer"; private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME = - "pipeline-document-chunking-fixed-token-length-lowercase-tokenizer"; + "pipeline-text-chunking-fixed-token-length-lowercase-tokenizer"; - private static final String DELIMITER_PIPELINE_NAME = "pipeline-document-chunking-delimiter"; + private static final String DELIMITER_PIPELINE_NAME = "pipeline-text-chunking-delimiter"; - private static final String CASCADE_PIPELINE_NAME = "pipeline-document-chunking-cascade"; + private static final String CASCADE_PIPELINE_NAME = "pipeline-text-chunking-cascade"; - private static final String TEST_DOCUMENT = "processor/chunker/DocumentChunkingTestDocument.json"; + private static final String TEST_DOCUMENT = "processor/chunker/TextChunkingTestDocument.json"; - private static final String TEST_LONG_DOCUMENT = 
"processor/chunker/DocumentChunkingTestLongDocument.json"; + private static final String TEST_LONG_DOCUMENT = "processor/chunker/TextChunkingTestLongDocument.json"; private static final Map PIPELINE_CONFIGS_BY_NAME = Map.of( FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME, @@ -67,10 +67,10 @@ public void setUp() throws Exception { updateClusterSettings(); } - public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception { + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); - createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); + createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -83,10 +83,10 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT } } - public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception { + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); - createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); + createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -99,10 +99,10 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok } } - public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception { + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); - createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); + createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -115,11 +115,11 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercase } } - public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail() + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail() throws Exception { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); - createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); + createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT)); // max_token_count is 100 by index settings assert (exception.getMessage() @@ -130,10 +130,10 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT } } - public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception { + public void testTextChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception { try { 
createPipelineProcessor(DELIMITER_PIPELINE_NAME); - createDocumentChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME); + createTextChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -147,10 +147,10 @@ public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() th } } - public void testDocumentChunkingProcessor_withCascadePipeline_successful() throws Exception { + public void testTextChunkingProcessor_withCascadePipeline_successful() throws Exception { try { createPipelineProcessor(CASCADE_PIPELINE_NAME); - createDocumentChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME); + createTextChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME); ingestDocument(TEST_DOCUMENT); List expectedPassages = new ArrayList<>(); @@ -206,8 +206,8 @@ private void createPipelineProcessor(String pipelineName) throws Exception { assertEquals("true", node.get("acknowledged").toString()); } - private void createDocumentChunkingIndex(String indexName, String pipelineName) throws Exception { - URL indexSettingsURLPath = classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json"); + private void createTextChunkingIndex(String indexName, String pipelineName) throws Exception { + URL indexSettingsURLPath = classLoader.getResource("processor/chunker/TextChunkingIndexSettings.json"); assert indexSettingsURLPath != null; createIndexWithConfiguration(indexName, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java similarity index 86% rename from src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java rename to src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index b63a512d8..6f7598bf8 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -34,16 +34,16 @@ import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory; +import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; -public class DocumentChunkingProcessorTests extends OpenSearchTestCase { +public class TextChunkingProcessorTests extends OpenSearchTestCase { - private DocumentChunkingProcessorFactory documentChunkingProcessorFactory; + private TextChunkingProcessorFactory textChunkingProcessorFactory; private static final String 
PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; @@ -84,12 +84,7 @@ public void setup() { when(metadata.index(anyString())).thenReturn(null); when(clusterState.metadata()).thenReturn(metadata); when(clusterService.state()).thenReturn(clusterState); - documentChunkingProcessorFactory = new DocumentChunkingProcessorFactory( - environment, - clusterService, - indicesService, - getAnalysisRegistry() - ); + textChunkingProcessorFactory = new TextChunkingProcessorFactory(environment, clusterService, indicesService, getAnalysisRegistry()); } private Map createFixedTokenLengthParameters() { @@ -130,29 +125,29 @@ private Map createNestedFieldMap() { } @SneakyThrows - private DocumentChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { + private TextChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); - return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @SneakyThrows - private DocumentChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkNum(Map fieldMap, int maxChunkNum) { + private TextChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkNum(Map fieldMap, int maxChunkNum) { Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); - return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } @SneakyThrows - private DocumentChunkingProcessor createDelimiterInstance() { + private TextChunkingProcessor createDelimiterInstance() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); @@ -161,7 +156,7 @@ private DocumentChunkingProcessor createDelimiterInstance() { config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); - return documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } public void testCreate_whenAlgorithmFieldMissing_thenFail() { @@ -171,7 +166,7 @@ public void testCreate_whenAlgorithmFieldMissing_thenFail() { Map registry = new HashMap<>(); OpenSearchParseException openSearchParseException = assertThrows( OpenSearchParseException.class, - () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); } @@ -188,7 +183,7 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { config.put(ALGORITHM_FIELD, algorithmMap); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> documentChunkingProcessorFactory.create(registry, 
PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer", illegalArgumentException.getMessage()); @@ -199,12 +194,12 @@ public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() { Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", @@ -217,14 +212,14 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", @@ -238,13 +233,13 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_thenFail() { Map algorithmMap = new HashMap<>(); String invalid_algorithm_type = "invalid algorithm"; fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); algorithmMap.put(invalid_algorithm_type, createFixedTokenLengthParameters()); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assert (illegalArgumentException.getMessage() .contains("Unable to create the processor as chunker algorithm [" + invalid_algorithm_type + "] is not supported")); @@ -255,13 +250,13 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap); + config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, 1); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = 
assertThrows( IllegalArgumentException.class, - () -> documentChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( "Unable to create the processor as [" @@ -275,9 +270,9 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { @SneakyThrows public void testGetType() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); String type = processor.getType(); - assertEquals(DocumentChunkingProcessor.TYPE, type); + assertEquals(TextChunkingProcessor.TYPE, type); } private String createSourceDataString() { @@ -353,7 +348,7 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_thenSucceed() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -368,7 +363,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_thenSucceed() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); for (int i = 0; i < 2; i++) { IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); @@ -385,7 +380,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumExceed_thenFail() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1); + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -400,7 +395,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumE @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -415,7 +410,7 @@ public void testExecute_withFixedTokenLength_andSourceDataString_thenSucceed() { @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataInvalidType_thenFail() { - DocumentChunkingProcessor processor = 
createFixedTokenLengthInstance(createStringFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); Map sourceAndMetadata = new HashMap<>(); sourceAndMetadata.put(INPUT_FIELD, 1); sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME); @@ -432,7 +427,7 @@ public void testExecute_withFixedTokenLength_andSourceDataInvalidType_thenFail() @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSucceed() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -451,7 +446,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSuccee @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataListHybridType_thenFail() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListHybridType()); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -465,7 +460,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListHybridType_thenFai @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataListWithNull_thenFail() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListWithNull()); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -477,7 +472,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListWithNull_thenFail( @SuppressWarnings("unchecked") @SneakyThrows public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMap()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); @@ -496,7 +491,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() @SneakyThrows public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_thenFail() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createMaxDepthLimitExceedMap(0)); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -510,7 +505,7 @@ public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_then @SneakyThrows public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() { - DocumentChunkingProcessor 
processor = createFixedTokenLengthInstance(createNestedFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataInvalidNestedMap()); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -525,7 +520,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() { @SneakyThrows @SuppressWarnings("unchecked") public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceDataList_thenSucceed() { - DocumentChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); @@ -548,7 +543,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceDataList @SneakyThrows public void testExecute_withDelimiter_andSourceDataString_thenSucceed() { - DocumentChunkingProcessor processor = createDelimiterInstance(); + TextChunkingProcessor processor = createDelimiterInstance(); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactoryTests.java similarity index 74% rename from src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java rename to src/test/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactoryTests.java index 1a5635791..943a6d8d5 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/factory/DocumentChunkingProcessorFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactoryTests.java @@ -22,21 +22,21 @@ import org.opensearch.indices.IndicesService; import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.ingest.Processor; -import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor; +import org.opensearch.neuralsearch.processor.TextChunkingProcessor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD; -import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD; -public class DocumentChunkingProcessorFactoryTests extends OpenSearchTestCase { +public class TextChunkingProcessorFactoryTests extends OpenSearchTestCase { private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = 
"mockDescription"; private static final Map algorithmMap = Map.of(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, new HashMap<>()); - private DocumentChunkingProcessorFactory documentChunkingProcessorFactory; + private TextChunkingProcessorFactory textChunkingProcessorFactory; @SneakyThrows private AnalysisRegistry getAnalysisRegistry() { @@ -63,7 +63,7 @@ public void setup() { Environment environment = mock(Environment.class); ClusterService clusterService = mock(ClusterService.class); IndicesService indicesService = mock(IndicesService.class); - this.documentChunkingProcessorFactory = new DocumentChunkingProcessorFactory( + this.textChunkingProcessorFactory = new TextChunkingProcessorFactory( environment, clusterService, indicesService, @@ -72,41 +72,41 @@ public void setup() { } @SneakyThrows - public void testDocumentChunkingProcessorFactory_whenAllParamsPassed_thenSuccessful() { + public void testTextChunkingProcessorFactory_whenAllParamsPassed_thenSuccessful() { final Map processorFactories = new HashMap<>(); Map config = new HashMap<>(); config.put(ALGORITHM_FIELD, algorithmMap); config.put(FIELD_MAP_FIELD, new HashMap<>()); - DocumentChunkingProcessor documentChunkingProcessor = documentChunkingProcessorFactory.create( + TextChunkingProcessor textChunkingProcessor = textChunkingProcessorFactory.create( processorFactories, PROCESSOR_TAG, DESCRIPTION, config ); - assertNotNull(documentChunkingProcessor); - assertEquals(TYPE, documentChunkingProcessor.getType()); + assertNotNull(textChunkingProcessor); + assertEquals(TYPE, textChunkingProcessor.getType()); } @SneakyThrows - public void testDocumentChunkingProcessorFactory_whenOnlyFieldMap_thenFail() { + public void testTextChunkingProcessorFactory_whenOnlyFieldMap_thenFail() { final Map processorFactories = new HashMap<>(); Map config = new HashMap<>(); config.put(FIELD_MAP_FIELD, new HashMap<>()); Exception exception = assertThrows( Exception.class, - () -> documentChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", exception.getMessage()); } @SneakyThrows - public void testDocumentChunkingProcessorFactory_whenOnlyAlgorithm_thenFail() { + public void testTextChunkingProcessorFactory_whenOnlyAlgorithm_thenFail() { final Map processorFactories = new HashMap<>(); Map config = new HashMap<>(); config.put(ALGORITHM_FIELD, algorithmMap); Exception exception = assertThrows( Exception.class, - () -> documentChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config) + () -> textChunkingProcessorFactory.create(processorFactories, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals("[" + FIELD_MAP_FIELD + "] required property is missing", exception.getMessage()); } diff --git a/src/test/resources/processor/chunker/DocumentChunkingIndexSettings.json b/src/test/resources/processor/chunker/TextChunkingIndexSettings.json similarity index 100% rename from src/test/resources/processor/chunker/DocumentChunkingIndexSettings.json rename to src/test/resources/processor/chunker/TextChunkingIndexSettings.json diff --git a/src/test/resources/processor/chunker/DocumentChunkingTestDocument.json b/src/test/resources/processor/chunker/TextChunkingTestDocument.json similarity index 100% rename from src/test/resources/processor/chunker/DocumentChunkingTestDocument.json rename to 
src/test/resources/processor/chunker/TextChunkingTestDocument.json diff --git a/src/test/resources/processor/chunker/DocumentChunkingTestLongDocument.json b/src/test/resources/processor/chunker/TextChunkingTestLongDocument.json similarity index 100% rename from src/test/resources/processor/chunker/DocumentChunkingTestLongDocument.json rename to src/test/resources/processor/chunker/TextChunkingTestLongDocument.json From 6461b3251308fa9f313956267b4d22ef8dc03beb Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 15:56:54 +0800 Subject: [PATCH 127/189] update default delimiter to be \n\n Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/DelimiterChunker.java | 2 +- .../processor/chunker/DelimiterChunkerTests.java | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 86f5aac69..f426f2b37 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -21,7 +21,7 @@ public DelimiterChunker(Map parameters) { public static final String DELIMITER_FIELD = "delimiter"; - public static final String DEFAULT_DELIMITER = "."; + public static final String DEFAULT_DELIMITER = "\n\n"; private String delimiter; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index ddc1b7eb7..fec187b9e 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -38,10 +38,11 @@ public void testChunker() { } public void testChunkerWithDefaultDelimiter() { + // default delimiter is \n\n DelimiterChunker chunker = new DelimiterChunker(Map.of()); - String content = "a.b.c.d"; + String content = "a.b\n\nc.d"; List chunkResult = chunker.chunk(content); - assertEquals(List.of("a.", "b.", "c.", "d"), chunkResult); + assertEquals(List.of("a.b\n\n", "c.d"), chunkResult); } public void testChunkerWithDelimiterEnd() { From 2a0a879d3bc1e5d1f9a6ae0769a0fc47ce949000 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 15:59:32 +0800 Subject: [PATCH 128/189] remove change log in 3.0 unreleased Signed-off-by: yuye-aws --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3094af9b..2b732b446 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased 3.0](https://github.com/opensearch-project/neural-search/compare/2.x...HEAD) ### Features -- Implement document chunking processor with fixed token length and delimiter algorithm ([#607](https://github.com/opensearch-project/neural-search/pull/607/)) ### Enhancements ### Bug Fixes - Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438)) From fbb4edbcccd0d4b9d0fa39048ce1ba3107a8c409 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 16:06:34 +0800 Subject: [PATCH 129/189] fix IT failure due to chunking processor rename Signed-off-by: yuye-aws --- .../processor/chunker/PipelineForCascadedChunker.json | 4 ++-- .../processor/chunker/PipelineForDelimiterChunker.json | 2 +- 
...PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json | 2 +- ...elineForFixedTokenLengthChunkerWithLowercaseTokenizer.json | 2 +- ...pelineForFixedTokenLengthChunkerWithStandardTokenizer.json | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test/resources/processor/chunker/PipelineForCascadedChunker.json b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json index e6add1f9f..e7ba380d4 100644 --- a/src/test/resources/processor/chunker/PipelineForCascadedChunker.json +++ b/src/test/resources/processor/chunker/PipelineForCascadedChunker.json @@ -2,7 +2,7 @@ "description": "An example cascaded pipeline with fixed token length algorithm after chunking algorithm", "processors" : [ { - "chunking": { + "text_chunking": { "field_map": { "body": "body_chunk_intermediate" }, @@ -14,7 +14,7 @@ } }, { - "chunking": { + "text_chunking": { "field_map": { "body_chunk_intermediate": "body_chunk" }, diff --git a/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json b/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json index dfa504065..c4e66f58c 100644 --- a/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json +++ b/src/test/resources/processor/chunker/PipelineForDelimiterChunker.json @@ -2,7 +2,7 @@ "description": "An example delimiter chunker pipeline", "processors" : [ { - "chunking": { + "text_chunking": { "field_map": { "body": "body_chunk" }, diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json index e94dc1c05..7026676f8 100644 --- a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLetterTokenizer.json @@ -2,7 +2,7 @@ "description": "An example fixed token length chunker pipeline with letter tokenizer", "processors" : [ { - "chunking": { + "text_chunking": { "field_map": { "body": "body_chunk" }, diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json index 2f2ccb664..cd1c67fc5 100644 --- a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithLowercaseTokenizer.json @@ -2,7 +2,7 @@ "description": "An example fixed token length chunker pipeline with lowercase tokenizer", "processors" : [ { - "chunking": { + "text_chunking": { "field_map": { "body": "body_chunk" }, diff --git a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json index f6dcd844e..6c727b3b4 100644 --- a/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json +++ b/src/test/resources/processor/chunker/PipelineForFixedTokenLengthChunkerWithStandardTokenizer.json @@ -2,7 +2,7 @@ "description": "An example fixed token length chunker pipeline with standard tokenizer", "processors" : [ { - "chunking": { + "text_chunking": { "field_map": { "body": "body_chunk" }, From 050f16305b3c2c959518529978dab3d3b71c345d Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 16:22:15 +0800 Subject: [PATCH 130/189] 
update javadoc for text chunking processor factory Signed-off-by: yuye-aws --- .../processor/factory/TextChunkingProcessorFactory.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java index 64db2aa83..efffcc908 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java @@ -19,7 +19,9 @@ /** * Factory for chunking ingest processor for ingestion pipeline. - * Instantiates processor based on user provided input. + * Instantiates processor based on user provided input, which includes: + * 1. field_map: the input and output fields specified by the user + * 2. algorithm: chunking algorithm and its parameters */ public class TextChunkingProcessorFactory implements Processor.Factory { From e61f295dbedefa7b78fe89d141e50f008be92be7 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 16:51:13 +0800 Subject: [PATCH 131/189] adjust functions in chunker interface Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 7 +----- .../processor/chunker/Chunker.java | 20 +++------------- .../chunker/ChunkerParameterValidator.java | 3 +++ .../processor/chunker/DelimiterChunker.java | 13 ++++++---- .../chunker/FixedTokenLengthChunker.java | 4 ++-- .../chunker/DelimiterChunkerTests.java | 14 +++++------ .../chunker/FixedTokenLengthChunkerTests.java | 24 +++++++++---------- 7 files changed, 37 insertions(+), 48 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 8e46adcac..123d3d4ca 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -148,12 +148,7 @@ private boolean isListOfString(Object value) { private int chunkString(String content, List result, Map runTimeParameters, int chunkCount) { // chunk the content, return the updated chunkCount and add chunk passages to result - List contentResult; - if (chunker instanceof FixedTokenLengthChunker) { - contentResult = chunker.chunk(content, runTimeParameters); - } else { - contentResult = chunker.chunk(content); - } + List contentResult = chunker.chunk(content, runTimeParameters); chunkCount += contentResult.size(); if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCount > maxChunkLimit) { throw new IllegalArgumentException( diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 29a0539f2..8419c1d98 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -4,8 +4,6 @@ */ package org.opensearch.neuralsearch.processor.chunker; -import com.google.common.collect.ImmutableList; - import java.util.Map; import java.util.List; @@ -16,22 +14,12 @@ public interface Chunker { /** - * Validate the parameters for chunking algorithm, + * Validate and parse the parameters for chunking algorithm, * will throw IllegalArgumentException when parameters are invalid * * @param parameters a map containing parameters for 
chunking algorithms */ - void validateParameters(Map parameters); - - /** - * Chunk the incoming string according to parameters and return chunked passages - * - * @param content input string - * @return Chunked passages - */ - default List chunk(String content) { - return ImmutableList.of(); - } + void validateAndParseParameters(Map parameters); /** * Chunk the incoming string according to parameters and return chunked passages @@ -40,7 +28,5 @@ default List chunk(String content) { * @param runtimeParameters a map containing runtime parameters for chunking algorithms * @return Chunked passages */ - default List chunk(String content, Map runtimeParameters) { - return ImmutableList.of(); - } + List chunk(String content, Map runtimeParameters); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index b3f399074..368f2cbfe 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -9,6 +9,9 @@ import java.util.Map; +/** + * Validate and parse the parameters for chunking algorithms + */ public class ChunkerParameterValidator { public static String validateStringParameters( diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index f426f2b37..aabe1d4ae 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -16,7 +16,7 @@ public class DelimiterChunker implements Chunker { public DelimiterChunker(Map parameters) { - validateParameters(parameters); + validateAndParseParameters(parameters); } public static final String DELIMITER_FIELD = "delimiter"; @@ -33,12 +33,18 @@ public DelimiterChunker(Map parameters) { * @throws IllegalArgumentException If delimiter is not a string or empty */ @Override - public void validateParameters(Map parameters) { + public void validateAndParseParameters(Map parameters) { this.delimiter = validateStringParameters(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER, false); } + /** + * Return the chunked passages for delimiter algorithm + * + * @param content input string + * @param runtimeParameters a map for runtime parameters, but not needed by delimiter algorithm + */ @Override - public List chunk(String content) { + public List chunk(String content, Map runtimeParameters) { List chunkResult = new ArrayList<>(); int start = 0, end; int nextDelimiterPosition = content.indexOf(delimiter); @@ -56,5 +62,4 @@ public List chunk(String content) { return chunkResult; } - } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index acdd65aea..2968ec9f5 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -44,7 +44,7 @@ public class FixedTokenLengthChunker implements Chunker { private final AnalysisRegistry analysisRegistry; public FixedTokenLengthChunker(Map parameters) { - validateParameters(parameters); + validateAndParseParameters(parameters); 
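// the analysis registry is supplied through the parameter map by the processor and is later used to run the tokenizer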
this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD); } @@ -63,7 +63,7 @@ public FixedTokenLengthChunker(Map parameters) { * tokenizer should be string */ @Override - public void validateParameters(Map parameters) { + public void validateAndParseParameters(Map parameters) { this.tokenLimit = validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); if (parameters.containsKey(OVERLAP_RATE_FIELD)) { String overlapRateString = parameters.get(OVERLAP_RATE_FIELD).toString(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index fec187b9e..37969a51b 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -33,7 +33,7 @@ public void testChunkerWithDelimiterFieldNoString() { public void testChunker() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "a\nb\nc\nd"; - List chunkResult = chunker.chunk(content); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } @@ -41,42 +41,42 @@ public void testChunkerWithDefaultDelimiter() { // default delimiter is \n\n DelimiterChunker chunker = new DelimiterChunker(Map.of()); String content = "a.b\n\nc.d"; - List chunkResult = chunker.chunk(content); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("a.b\n\n", "c.d"), chunkResult); } public void testChunkerWithDelimiterEnd() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "a\nb\nc\nd\n"; - List chunkResult = chunker.chunk(content); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult); } public void testChunkerWithOnlyDelimiter() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "\n"; - List chunkResult = chunker.chunk(content); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("\n"), chunkResult); } public void testChunkerWithAllDelimiters() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "\n\n\n"; - List chunkResult = chunker.chunk(content); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("\n", "\n", "\n"), chunkResult); } public void testChunkerWithDifferentDelimiters() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, ".")); String content = "a.b.cc.d."; - List chunkResult = chunker.chunk(content); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("a.", "b.", "cc.", "d."), chunkResult); } public void testChunkerWithStringDelimiter() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n")); String content = "\n\na\n\n\n"; - List chunkResult = chunker.chunk(content); + List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index d9934c184..49b633ced 100644 --- 
a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -61,16 +61,16 @@ public Map> getTokeniz return new FixedTokenLengthChunker(nonRuntimeParameters); } - public void testValidateParameters_whenNoParams_thenSuccessful() { - fixedTokenLengthChunker.validateParameters(Map.of()); + public void testValidateAndParseParameters_whenNoParams_thenSuccessful() { + fixedTokenLengthChunker.validateAndParseParameters(Map.of()); } - public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { + public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", @@ -78,22 +78,22 @@ public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { ); } - public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { + public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, -1); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive", illegalArgumentException.getMessage()); } - public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { + public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() { Map parameters = new HashMap<>(); parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", @@ -101,12 +101,12 @@ public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { ); } - public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { + public void testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail() { Map parameters = new HashMap<>(); parameters.put(OVERLAP_RATE_FIELD, 0.6); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 0.5", @@ -114,12 +114,12 @@ public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { ); } - public void testValidateParameters_whenIllegalTokenizerType_thenFail() { + public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { Map parameters = new HashMap<>(); parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( 
IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( "Chunker parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]", From 4f870085eea2169e666e271edc22f303d155f487 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 12 Mar 2024 23:44:56 +0800 Subject: [PATCH 132/189] move algorithm name definition to concrete chunker class Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 4 +--- .../processor/chunker/ChunkerFactory.java | 9 +++------ .../processor/chunker/DelimiterChunker.java | 10 +++++----- .../chunker/FixedTokenLengthChunker.java | 1 + .../processor/TextChunkingProcessorTests.java | 17 ++++++++--------- .../processor/chunker/ChunkerFactoryTests.java | 6 +++--- .../TextChunkingProcessorFactoryTests.java | 4 ++-- 7 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 123d3d4ca..da7eb9859 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -30,8 +30,6 @@ import org.opensearch.index.mapper.IndexFieldMapper; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM; - /** * This processor is used for chunking user input data and chunked data could be used for downstream embedding processor, * algorithm can be used to indicate chunking algorithm and parameters, @@ -111,7 +109,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { ); } Map chunkerParameters = (Map) algorithmValue; - if (Objects.equals(algorithmKey, FIXED_TOKEN_LENGTH_ALGORITHM)) { + if (Objects.equals(algorithmKey, FixedTokenLengthChunker.ALGORITHM_NAME)) { chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); } this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 086cb0b71..38f9767ee 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -12,14 +12,11 @@ */ public class ChunkerFactory { - public static final String FIXED_TOKEN_LENGTH_ALGORITHM = "fixed_token_length"; - public static final String DELIMITER_ALGORITHM = "delimiter"; - public static Chunker create(String type, Map parameters) { switch (type) { - case FIXED_TOKEN_LENGTH_ALGORITHM: + case FixedTokenLengthChunker.ALGORITHM_NAME: return new FixedTokenLengthChunker(parameters); - case DELIMITER_ALGORITHM: + case DelimiterChunker.ALGORITHM_NAME: return new DelimiterChunker(parameters); default: throw new IllegalArgumentException( @@ -29,6 +26,6 @@ public static Chunker create(String type, Map parameters) { } public static Set getAllChunkers() { - return Set.of(FIXED_TOKEN_LENGTH_ALGORITHM, DELIMITER_ALGORITHM); + return Set.of(FixedTokenLengthChunker.ALGORITHM_NAME, DelimiterChunker.ALGORITHM_NAME); } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java 
b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index aabe1d4ae..d1a6eec97 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -15,16 +15,16 @@ */ public class DelimiterChunker implements Chunker { - public DelimiterChunker(Map parameters) { - validateAndParseParameters(parameters); - } - + public static final String ALGORITHM_NAME = "delimiter"; public static final String DELIMITER_FIELD = "delimiter"; - public static final String DEFAULT_DELIMITER = "\n\n"; private String delimiter; + public DelimiterChunker(Map parameters) { + validateAndParseParameters(parameters); + } + /** * Validate the chunked passages for delimiter algorithm * diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 2968ec9f5..efe9cd606 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -22,6 +22,7 @@ */ public class FixedTokenLengthChunker implements Chunker { + public static final String ALGORITHM_NAME = "fixed_token_length"; public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 6f7598bf8..ddfdfed44 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -31,7 +31,6 @@ import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; -import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory; @@ -128,7 +127,7 @@ private Map createNestedFieldMap() { private TextChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); - algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParameters()); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); @@ -139,7 +138,7 @@ private TextChunkingProcessor createFixedTokenLengthInstance(Map private TextChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkNum(Map fieldMap, int maxChunkNum) { Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); - algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new 
HashMap<>(); @@ -151,7 +150,7 @@ private TextChunkingProcessor createDelimiterInstance() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); - algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); + algorithmMap.put(DelimiterChunker.ALGORITHM_NAME, createDelimiterParameters()); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); @@ -178,7 +177,7 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParametersWithMaxChunk(-2)); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(-2)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); IllegalArgumentException illegalArgumentException = assertThrows( @@ -213,8 +212,8 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - algorithmMap.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParameters()); + algorithmMap.put(DelimiterChunker.ALGORITHM_NAME, createDelimiterParameters()); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( @@ -251,7 +250,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - algorithmMap.put(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, 1); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, 1); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); IllegalArgumentException illegalArgumentException = assertThrows( @@ -260,7 +259,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { ); assertEquals( "Unable to create the processor as [" - + ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM + + FixedTokenLengthChunker.ALGORITHM_NAME + "] parameters cannot be cast to [" + Map.class.getName() + "]", diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index a26021559..95adcc075 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -20,18 +20,18 @@ public class ChunkerFactoryTests extends OpenSearchTestCase { private AnalysisRegistry analysisRegistry; public void testGetAllChunkers() { - Set expected = Set.of(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, ChunkerFactory.DELIMITER_ALGORITHM); + Set expected = Set.of(FixedTokenLengthChunker.ALGORITHM_NAME, DelimiterChunker.ALGORITHM_NAME); assertEquals(expected, ChunkerFactory.getAllChunkers()); } public void testCreate_FixedTokenLength() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, 
createChunkParameters()); + Chunker chunker = ChunkerFactory.create(FixedTokenLengthChunker.ALGORITHM_NAME, createChunkParameters()); assertNotNull(chunker); assertTrue(chunker instanceof FixedTokenLengthChunker); } public void testCreate_Delimiter() { - Chunker chunker = ChunkerFactory.create(ChunkerFactory.DELIMITER_ALGORITHM, createChunkParameters()); + Chunker chunker = ChunkerFactory.create(DelimiterChunker.ALGORITHM_NAME, createChunkParameters()); assertNotNull(chunker); assertTrue(chunker instanceof DelimiterChunker); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactoryTests.java index 943a6d8d5..3d9993e7b 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactoryTests.java @@ -23,7 +23,7 @@ import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.processor.TextChunkingProcessor; -import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; +import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; @@ -34,7 +34,7 @@ public class TextChunkingProcessorFactoryTests extends OpenSearchTestCase { private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; - private static final Map algorithmMap = Map.of(ChunkerFactory.FIXED_TOKEN_LENGTH_ALGORITHM, new HashMap<>()); + private static final Map algorithmMap = Map.of(FixedTokenLengthChunker.ALGORITHM_NAME, new HashMap<>()); private TextChunkingProcessorFactory textChunkingProcessorFactory; From c651b3ef898b77ff03ba152ddc0fd15eba6bd784 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 15:33:20 +0800 Subject: [PATCH 133/189] update string formatted message for text chunking processor Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 65 +++++++++++++------ .../processor/TextChunkingProcessorTests.java | 62 ++++++++++++------ 2 files changed, 89 insertions(+), 38 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index da7eb9859..a09a2190a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.Locale; import java.util.ArrayList; import java.util.List; import java.util.Objects; @@ -86,9 +87,13 @@ public String getType() { @SuppressWarnings("unchecked") private void validateAndParseAlgorithmMap(Map algorithmMap) { - if (algorithmMap.size() != 1) { + if (algorithmMap.isEmpty()) { throw new IllegalArgumentException( - "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm" + String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD) + ); + } else if (algorithmMap.size() > 1) { + throw new IllegalArgumentException( + 
String.format(Locale.ROOT, "Unable to create %s processor as [%s] contain multiple algorithms", TYPE, ALGORITHM_FIELD) ); } Entry algorithmEntry = algorithmMap.entrySet().iterator().next(); @@ -97,15 +102,24 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { Set supportedChunkers = ChunkerFactory.getAllChunkers(); if (!supportedChunkers.contains(algorithmKey)) { throw new IllegalArgumentException( - "Unable to create the processor as chunker algorithm [" - + algorithmKey - + "] is not supported. Supported chunkers types are " - + supportedChunkers + String.format( + Locale.ROOT, + "Unable to create %s processor as chunker algorithm [%s] is not supported. Supported chunkers types are %s", + TYPE, + algorithmKey, + supportedChunkers + ) ); } if (!(algorithmValue instanceof Map)) { throw new IllegalArgumentException( - "Unable to create the processor as [" + algorithmKey + "] parameters cannot be cast to [" + Map.class.getName() + "]" + String.format( + Locale.ROOT, + "Unable to create %s processor as [%s] parameters cannot be cast to [%s]", + TYPE, + algorithmKey, + Map.class.getName() + ) ); } Map chunkerParameters = (Map) algorithmValue; @@ -117,12 +131,14 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString(); if (!(NumberUtils.isParsable(maxChunkLimitString))) { throw new IllegalArgumentException( - "Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", MAX_CHUNK_LIMIT_FIELD, Number.class.getName()) ); } int maxChunkLimit = NumberUtils.createInteger(maxChunkLimitString); if (maxChunkLimit <= 0 && maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT) { - throw new IllegalArgumentException("Parameter [" + MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer"); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] must be a positive integer", MAX_CHUNK_LIMIT_FIELD) + ); } this.maxChunkLimit = maxChunkLimit; } else { @@ -150,11 +166,12 @@ private int chunkString(String content, List result, Map chunkCount += contentResult.size(); if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCount > maxChunkLimit) { throw new IllegalArgumentException( - "Unable to create the processor as the number of chunks [" - + chunkCount - + "] exceeds the maximum chunk limit [" - + maxChunkLimit - + "]" + String.format( + Locale.ROOT, + "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]", + chunkCount, + maxChunkLimit + ) ); } result.addAll(contentResult); @@ -222,7 +239,9 @@ private void validateFieldsValue(IngestDocument ingestDocument) { if (sourceValue instanceof List || sourceValue instanceof Map) { validateNestedTypeValue(sourceKey, sourceValue, 1); } else if (!(sourceValue instanceof String)) { - throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it"); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "field [%s] is neither string nor nested type, cannot process it", sourceKey) + ); } } } @@ -231,7 +250,9 @@ private void validateFieldsValue(IngestDocument ingestDocument) { @SuppressWarnings({ "rawtypes", "unchecked" }) private void validateNestedTypeValue(String sourceKey, Object sourceValue, int maxDepth) { if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) { - throw new IllegalArgumentException("map type field 
[" + sourceKey + "] reached max depth limit, cannot process it"); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "map type field [%s] reached max depth limit, cannot process it", sourceKey) + ); } else if (sourceValue instanceof List) { validateListTypeValue(sourceKey, sourceValue, maxDepth); } else if (sourceValue instanceof Map) { @@ -240,7 +261,9 @@ private void validateNestedTypeValue(String sourceKey, Object sourceValue, int m .filter(Objects::nonNull) .forEach(x -> validateNestedTypeValue(sourceKey, x, maxDepth + 1)); } else if (!(sourceValue instanceof String)) { - throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it"); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "map type field [%s] has non-string type, cannot process it", sourceKey) + ); } } @@ -250,9 +273,13 @@ private void validateListTypeValue(String sourceKey, Object sourceValue, int max if (value instanceof Map) { validateNestedTypeValue(sourceKey, value, maxDepth + 1); } else if (value == null) { - throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it"); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "list type field [%s] has null, cannot process it", sourceKey) + ); } else if (!(value instanceof String)) { - throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it"); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "list type field [%s] has non-string value, cannot process it", sourceKey) + ); } } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index ddfdfed44..877e1974d 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -10,6 +10,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -36,6 +37,7 @@ import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; @@ -167,7 +169,10 @@ public void testCreate_whenAlgorithmFieldMissing_thenFail() { OpenSearchParseException.class, () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("[" + ALGORITHM_FIELD + "] required property is missing", openSearchParseException.getMessage()); + assertEquals( + String.format(Locale.ROOT, "[%s] required property is missing", ALGORITHM_FIELD), + openSearchParseException.getMessage() + ); } @SneakyThrows @@ -184,8 +189,10 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { IllegalArgumentException.class, () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); - assertEquals("Parameter [" + 
MAX_CHUNK_LIMIT_FIELD + "] must be a positive integer", illegalArgumentException.getMessage()); - + assertEquals( + String.format(Locale.ROOT, "Parameter [%s] must be a positive integer", MAX_CHUNK_LIMIT_FIELD), + illegalArgumentException.getMessage() + ); } public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() { @@ -201,7 +208,7 @@ public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() { () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD), illegalArgumentException.getMessage() ); } @@ -221,7 +228,7 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "Unable to create the processor as [" + ALGORITHM_FIELD + "] must contain and only contain 1 algorithm", + String.format(Locale.ROOT, "Unable to create %s processor as [%s] contain multiple algorithms", TYPE, ALGORITHM_FIELD), illegalArgumentException.getMessage() ); } @@ -241,7 +248,14 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_thenFail() { () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assert (illegalArgumentException.getMessage() - .contains("Unable to create the processor as chunker algorithm [" + invalid_algorithm_type + "] is not supported")); + .contains( + String.format( + Locale.ROOT, + "Unable to create %s processor as chunker algorithm [%s] is not supported.", + TYPE, + invalid_algorithm_type + ) + )); } public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { @@ -258,11 +272,13 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - "Unable to create the processor as [" - + FixedTokenLengthChunker.ALGORITHM_NAME - + "] parameters cannot be cast to [" - + Map.class.getName() - + "]", + String.format( + Locale.ROOT, + "Unable to create %s processor as [%s] parameters cannot be cast to [%s]", + TYPE, + FixedTokenLengthChunker.ALGORITHM_NAME, + Map.class.getName() + ), illegalArgumentException.getMessage() ); } @@ -271,7 +287,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { public void testGetType() { TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap()); String type = processor.getType(); - assertEquals(TextChunkingProcessor.TYPE, type); + assertEquals(TYPE, type); } private String createSourceDataString() { @@ -386,8 +402,13 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumE () -> processor.execute(ingestDocument) ); assertEquals( - illegalArgumentException.getMessage(), - "Unable to create the processor as the number of chunks [" + "3" + "] exceeds the maximum chunk limit [" + "1" + "]" + String.format( + Locale.ROOT, + "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]", + 3, + 1 + ), + illegalArgumentException.getMessage() ); } @@ -419,7 +440,7 @@ public void testExecute_withFixedTokenLength_andSourceDataInvalidType_thenFail() () -> processor.execute(ingestDocument) ); assertEquals( - "field [" + INPUT_FIELD + "] is neither string nor nested type, 
cannot process it", + String.format(Locale.ROOT, "field [%s] is neither string nor nested type, cannot process it", INPUT_FIELD), illegalArgumentException.getMessage() ); } @@ -452,7 +473,7 @@ public void testExecute_withFixedTokenLength_andSourceDataListHybridType_thenFai () -> processor.execute(ingestDocument) ); assertEquals( - "list type field [" + INPUT_FIELD + "] has non string value, cannot process it", + String.format(Locale.ROOT, "list type field [%s] has non-string value, cannot process it", INPUT_FIELD), illegalArgumentException.getMessage() ); } @@ -465,7 +486,10 @@ public void testExecute_withFixedTokenLength_andSourceDataListWithNull_thenFail( IllegalArgumentException.class, () -> processor.execute(ingestDocument) ); - assertEquals("list type field [" + INPUT_FIELD + "] has null, cannot process it", illegalArgumentException.getMessage()); + assertEquals( + String.format(Locale.ROOT, "list type field [%s] has null, cannot process it", INPUT_FIELD), + illegalArgumentException.getMessage() + ); } @SuppressWarnings("unchecked") @@ -497,7 +521,7 @@ public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_then () -> processor.execute(ingestDocument) ); assertEquals( - "map type field [" + INPUT_NESTED_FIELD_KEY + "] reached max depth limit, cannot process it", + String.format(Locale.ROOT, "map type field [%s] reached max depth limit, cannot process it", INPUT_NESTED_FIELD_KEY), illegalArgumentException.getMessage() ); } @@ -511,7 +535,7 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() { () -> processor.execute(ingestDocument) ); assertEquals( - "map type field [" + INPUT_NESTED_FIELD_KEY + "] has non-string type, cannot process it", + String.format(Locale.ROOT, "map type field [%s] has non-string type, cannot process it", INPUT_NESTED_FIELD_KEY), illegalArgumentException.getMessage() ); } From 0f4578299a4cd2a43bda989cd8c12936e8f76949 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 15:38:23 +0800 Subject: [PATCH 134/189] update string formatted message for chunker factory Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/ChunkerFactory.java | 8 +++++++- .../processor/chunker/ChunkerFactoryTests.java | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 38f9767ee..d6c475512 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -6,6 +6,7 @@ import java.util.Map; import java.util.Set; +import java.util.Locale; /** * A factory to create different chunking algorithm classes and return all supported chunking algorithms. @@ -20,7 +21,12 @@ public static Chunker create(String type, Map parameters) { return new DelimiterChunker(parameters); default: throw new IllegalArgumentException( - "chunker type [" + type + "] is not supported. Supported chunkers types are " + ChunkerFactory.getAllChunkers() + String.format( + Locale.ROOT, + "chunking algorithm [%s] is not supported. 
Supported chunking algorithms are %s", + type, + ChunkerFactory.getAllChunkers() + ) ); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 95adcc075..4bfb3f366 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -9,6 +9,7 @@ import org.opensearch.test.OpenSearchTestCase; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -42,7 +43,8 @@ public void testCreate_Invalid() { IllegalArgumentException.class, () -> ChunkerFactory.create(invalidChunkerType, createChunkParameters()) ); - assert (illegalArgumentException.getMessage().contains("chunker type [" + invalidChunkerType + "] is not supported.")); + assert (illegalArgumentException.getMessage() + .contains(String.format(Locale.ROOT, "chunking algorithm [%s] is not supported.", invalidChunkerType))); } private Map createChunkParameters() { From 3d1b79217636cd015d46368c1a1e3bdcc89bef3e Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 15:48:50 +0800 Subject: [PATCH 135/189] update string formatted message for chunker parameter validator Signed-off-by: yuye-aws --- .../chunker/ChunkerParameterValidator.java | 20 +++++++++++++------ .../chunker/DelimiterChunkerTests.java | 8 ++++++-- .../chunker/FixedTokenLengthChunkerTests.java | 15 +++++++++++--- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index 368f2cbfe..1ac5d971c 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -8,6 +8,7 @@ import org.apache.commons.lang3.math.NumberUtils; import java.util.Map; +import java.util.Locale; /** * Validate and parse the parameter for chunking algorithms @@ -26,9 +27,13 @@ public static String validateStringParameters( } Object fieldValue = parameters.get(fieldName); if (!(fieldValue instanceof String)) { - throw new IllegalArgumentException("Chunker parameter [" + fieldName + "] cannot be cast to [" + String.class.getName() + "]"); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", fieldName, String.class.getName()) + ); } else if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { - throw new IllegalArgumentException("Chunker parameter: " + fieldName + " should not be empty."); + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] should not be empty.", fieldName) + ); } return (String) fieldValue; } @@ -42,12 +47,15 @@ public static int validatePositiveIntegerParameter(Map parameter String fieldValue = parameters.get(fieldName).toString(); if (!(NumberUtils.isParsable(fieldValue))) { throw new IllegalArgumentException( - "fixed length parameter [" + fieldName + "] cannot be cast to [" + Number.class.getName() + "]" + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) ); } - if (NumberUtils.createInteger(fieldValue) <= 0) { - throw new IllegalArgumentException("fixed 
length parameter [" + fieldName + "] must be positive"); + int fieldValueInt = NumberUtils.createInteger(fieldValue); + if (fieldValueInt <= 0) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be positive.", fieldName) + ); } - return Integer.valueOf(fieldValue); + return fieldValueInt; } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 37969a51b..50615082b 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -5,6 +5,7 @@ package org.opensearch.neuralsearch.processor.chunker; import java.util.List; +import java.util.Locale; import java.util.Map; import org.junit.Assert; @@ -20,14 +21,17 @@ public void testChunkerWithDelimiterFieldNotString() { () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, List.of(""))) ); Assert.assertEquals( - "Chunker parameter [" + DELIMITER_FIELD + "] cannot be cast to [" + String.class.getName() + "]", + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", DELIMITER_FIELD, String.class.getName()), exception.getMessage() ); } public void testChunkerWithDelimiterFieldNoString() { Exception exception = assertThrows(IllegalArgumentException.class, () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, ""))); - Assert.assertEquals("Chunker parameter: " + DELIMITER_FIELD + " should not be empty.", exception.getMessage()); + Assert.assertEquals( + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] should not be empty.", DELIMITER_FIELD), + exception.getMessage() + ); } public void testChunker() { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 49b633ced..cf1b11327 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -73,7 +74,12 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", + String.format( + Locale.ROOT, + "Chunking algorithm parameter [%s] cannot be cast to [%s]", + TOKEN_LIMIT_FIELD, + Number.class.getName() + ), illegalArgumentException.getMessage() ); } @@ -85,7 +91,10 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail() IllegalArgumentException.class, () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); - assertEquals("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive", illegalArgumentException.getMessage()); + assertEquals( + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be positive.", TOKEN_LIMIT_FIELD), + illegalArgumentException.getMessage() + ); } public void 
testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() { @@ -122,7 +131,7 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - "Chunker parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]", + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", TOKENIZER_FIELD, String.class.getName()), illegalArgumentException.getMessage() ); } From 5600b3649f0e36dc2f4ade7e3f8c47776dfe7947 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 15:50:45 +0800 Subject: [PATCH 136/189] update java doc for delimiter algorithm Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/DelimiterChunker.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index d1a6eec97..ef181fac8 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -26,11 +26,11 @@ public DelimiterChunker(Map parameters) { } /** - * Validate the chunked passages for delimiter algorithm + * Validate and parse the parameters for delimiter algorithm, + * will throw IllegalArgumentException if delimiter is not a string or empty * * @param parameters a map containing parameters, containing the following parameters * 1. A string as the paragraph split indicator - * @throws IllegalArgumentException If delimiter is not a string or empty */ @Override public void validateAndParseParameters(Map parameters) { From 3d962ca70302fd5b592019308640d1a4f182f723 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 16:08:27 +0800 Subject: [PATCH 137/189] support range double in chunker parameter validator Signed-off-by: yuye-aws --- .../chunker/ChunkerParameterValidator.java | 30 +++++++++++++++++-- .../chunker/FixedTokenLengthChunker.java | 26 +++++----------- .../chunker/FixedTokenLengthChunkerTests.java | 9 ++++-- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index 1ac5d971c..d19c48e34 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -38,8 +38,8 @@ public static String validateStringParameters( return (String) fieldValue; } - public static int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { - // this method validate that parameter is a positive integer + public static Number validateNumberParameter(Map parameters, String fieldName, double defaultValue) { + // this method validate that parameter is a number if (!parameters.containsKey(fieldName)) { // all parameters are optional return defaultValue; @@ -50,7 +50,13 @@ public static int validatePositiveIntegerParameter(Map parameter String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) ); } - int fieldValueInt = NumberUtils.createInteger(fieldValue); + return NumberUtils.createNumber(fieldValue); + } + + public static int 
validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { + // this method validate that parameter is a positive integer + Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); + int fieldValueInt = fieldValueNumber.intValue(); if (fieldValueInt <= 0) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be positive.", fieldName) @@ -58,4 +64,22 @@ public static int validatePositiveIntegerParameter(Map parameter } return fieldValueInt; } + + public static double validateRangeDoubleParameter( + Map parameters, + String fieldName, + double lowerBound, + double upperBound, + double defaultValue + ) { + // this method validate that parameter is a double within [lowerBound, upperBound] + Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); + double fieldValueDouble = fieldValueNumber.doubleValue(); + if (fieldValueDouble < lowerBound || fieldValueDouble > upperBound) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be between %s and %s", fieldName, lowerBound, upperBound) + ); + } + return fieldValueDouble; + } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index efe9cd606..0a960dca2 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -8,12 +8,12 @@ import java.util.Map; import java.util.List; import java.util.ArrayList; -import org.apache.commons.lang3.math.NumberUtils; import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateRangeDoubleParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameters; @@ -66,23 +66,13 @@ public FixedTokenLengthChunker(Map parameters) { @Override public void validateAndParseParameters(Map parameters) { this.tokenLimit = validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); - if (parameters.containsKey(OVERLAP_RATE_FIELD)) { - String overlapRateString = parameters.get(OVERLAP_RATE_FIELD).toString(); - if (!(NumberUtils.isParsable(overlapRateString))) { - throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" - ); - } - double overlapRate = NumberUtils.createDouble(overlapRateString); - if (overlapRate < 0 || overlapRate > OVERLAP_RATE_UPPER_BOUND) { - throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and " + OVERLAP_RATE_UPPER_BOUND - ); - } - this.overlapRate = overlapRate; - } else { - this.overlapRate = DEFAULT_OVERLAP_RATE; - } + this.overlapRate = validateRangeDoubleParameter( + parameters, + OVERLAP_RATE_FIELD, + DEFAULT_OVERLAP_RATE, + OVERLAP_RATE_UPPER_BOUND, + DEFAULT_OVERLAP_RATE + ); 
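// Worked example of the range validation above (illustrative, using this class's constants
// DEFAULT_OVERLAP_RATE = 0.0 and OVERLAP_RATE_UPPER_BOUND = 0.5): an overlap_rate of 0.2 is
// accepted and parsed as a double, while 0.6 throws IllegalArgumentException with the message
// "Chunking algorithm parameter [overlap_rate] must be between 0.0 and 0.5".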
this.tokenizer = validateStringParameters(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER, false); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index cf1b11327..b52e1eef1 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -105,7 +105,12 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]", + String.format( Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", OVERLAP_RATE_FIELD, Number.class.getName() ), illegalArgumentException.getMessage() ); } @@ -118,7 +123,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail( () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 0.5", + String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, 0.0, 0.5), illegalArgumentException.getMessage() ); } From 42de9008c3dd4899c6d4f47f446638f41bcc8a31 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 16:13:46 +0800 Subject: [PATCH 138/189] update string formatted message for fixed token length algorithm Signed-off-by: yuye-aws --- .../processor/chunker/FixedTokenLengthChunker.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 0a960dca2..0fcefa1b5 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -5,6 +5,7 @@ package org.opensearch.neuralsearch.processor.chunker; import java.io.IOException; +import java.util.Locale; import java.util.Map; import java.util.List; import java.util.ArrayList; @@ -125,7 +126,10 @@ private List tokenize(String content, String tokenizer, int maxTok AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); return analyzeResponse.getTokens(); } catch (IOException e) { - throw new IllegalStateException("Fixed token length algorithm encounters exception: " + e.getMessage(), e); + throw new IllegalStateException( String.format(Locale.ROOT, "%s algorithm encounters exception in tokenization: %s", ALGORITHM_NAME, e.getMessage()), e ); } } } From 6d4fe8ccc3898ab8cd422fdb37682886abbc9887 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 16:22:06 +0800 Subject: [PATCH 139/189] use sneaky throws in text chunking processor IT Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessorIT.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java index a4e770e0a..dfcf4b2cb 100644 ---
a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java @@ -5,6 +5,7 @@ package org.opensearch.neuralsearch.processor; import com.google.common.collect.ImmutableList; +import lombok.SneakyThrows; import org.apache.hc.core5.http.HttpHeaders; import org.apache.hc.core5.http.io.entity.EntityUtils; import org.apache.hc.core5.http.message.BasicHeader; @@ -67,7 +68,8 @@ public void setUp() throws Exception { updateClusterSettings(); } - public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception { + @SneakyThrows + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); @@ -83,7 +85,8 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardToken } } - public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception { + @SneakyThrows + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME); @@ -99,7 +102,8 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokeniz } } - public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception { + @SneakyThrows + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME); @@ -115,8 +119,8 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseToke } } - public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail() - throws Exception { + @SneakyThrows + public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail() { try { createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME); @@ -130,7 +134,8 @@ public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardToken } } - public void testTextChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception { + @SneakyThrows + public void testTextChunkingProcessor_withDelimiterAlgorithm_successful() { try { createPipelineProcessor(DELIMITER_PIPELINE_NAME); createTextChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME); @@ -147,7 +152,8 @@ public void testTextChunkingProcessor_withDelimiterAlgorithm_successful() throws } } - public void testTextChunkingProcessor_withCascadePipeline_successful() throws Exception { + @SneakyThrows + public void testTextChunkingProcessor_withCascadePipeline_successful() { try { createPipelineProcessor(CASCADE_PIPELINE_NAME); createTextChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME); From e666f177e416f0218ca97cb7005e440bcedca795 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 16:47:59 +0800 Subject: [PATCH 140/189] 
add word tokenizer restriction for fixed token length algorithm Signed-off-by: yuye-aws --- .../chunker/ChunkerParameterValidator.java | 21 ++++++++++----- .../chunker/FixedTokenLengthChunker.java | 26 ++++++++++++++++--- .../chunker/FixedTokenLengthChunkerTests.java | 14 +++++++++- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index d19c48e34..9893efc5f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -15,6 +15,9 @@ */ public class ChunkerParameterValidator { + /** + * Validate and parse the parameter for string parameters + */ public static String validateStringParameters( Map parameters, String fieldName, @@ -22,7 +25,7 @@ public static String validateStringParameters( boolean allowEmpty ) { if (!parameters.containsKey(fieldName)) { - // all parameters are optional + // all chunking algorithm parameters are optional return defaultValue; } Object fieldValue = parameters.get(fieldName); @@ -38,10 +41,12 @@ public static String validateStringParameters( return (String) fieldValue; } - public static Number validateNumberParameter(Map parameters, String fieldName, double defaultValue) { - // this method validate that parameter is a number + /** + * Validate and parse the parameter for numeric parameters + */ + public static Number validateNumberParameter(Map parameters, String fieldName, Number defaultValue) { if (!parameters.containsKey(fieldName)) { - // all parameters are optional + // all chunking algorithm parameters are optional return defaultValue; } String fieldValue = parameters.get(fieldName).toString(); @@ -53,8 +58,10 @@ public static Number validateNumberParameter(Map parameters, Str return NumberUtils.createNumber(fieldValue); } + /** + * Validate and parse the parameter for positive integer parameters + */ public static int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { - // this method validate that parameter is a positive integer Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); int fieldValueInt = fieldValueNumber.intValue(); if (fieldValueInt <= 0) { @@ -65,6 +72,9 @@ public static int validatePositiveIntegerParameter(Map parameter return fieldValueInt; } + /** + * Validate and parse the parameter for double parameters within [lowerBound, upperBound] + */ public static double validateRangeDoubleParameter( Map parameters, String fieldName, @@ -72,7 +82,6 @@ public static double validateRangeDoubleParameter( double upperBound, double defaultValue ) { - // this method validate that parameter is a double within [lowerBound, upperBound] Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); double fieldValueDouble = fieldValueNumber.doubleValue(); if (fieldValueDouble < lowerBound || fieldValueDouble > upperBound) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 0fcefa1b5..1f9c0610f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ 
b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -8,6 +8,7 @@ import java.util.Locale; import java.util.Map; import java.util.List; +import java.util.Set; import java.util.ArrayList; import org.opensearch.index.analysis.AnalysisRegistry; @@ -35,14 +36,20 @@ public class FixedTokenLengthChunker implements Chunker { private static final double DEFAULT_OVERLAP_RATE = 0.0; private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; private static final String DEFAULT_TOKENIZER = "standard"; - private static final double OVERLAP_RATE_UPPER_BOUND = 0.5; - - private double overlapRate; + private static final Set WORD_TOKENIZERS = Set.of( + "standard", + "letter", + "lowercase", + "whitespace", + "uax_url_email", + "classic", + "thai" + ); private int tokenLimit; private String tokenizer; - + private double overlapRate; private final AnalysisRegistry analysisRegistry; public FixedTokenLengthChunker(Map parameters) { @@ -75,6 +82,17 @@ public void validateAndParseParameters(Map parameters) { DEFAULT_OVERLAP_RATE ); this.tokenizer = validateStringParameters(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER, false); + if (!WORD_TOKENIZERS.contains(this.tokenizer)) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s", + this.tokenizer, + ALGORITHM_NAME, + WORD_TOKENIZERS + ) + ); + } } /** diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index b52e1eef1..2acd3cd8f 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -23,7 +23,7 @@ import java.util.Map; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; - +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ALGORITHM_NAME; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; @@ -141,6 +141,18 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { ); } + public void testValidateAndParseParameters_whenUnsupportedTokenizer_thenFail() { + String ngramTokenizer = "ngram"; + Map parameters = new HashMap<>(); + parameters.put(TOKENIZER_FIELD, "ngram"); + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) + ); + assert (illegalArgumentException.getMessage() + .contains(String.format(Locale.ROOT, "tokenizer [%s] is not supported for [%s] algorithm.", ngramTokenizer, ALGORITHM_NAME))); + } + public void testChunk_withTokenLimit_10() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); From 958cc3be3e0abe57811d2662305248d3e68178fc Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 17:07:13 +0800 Subject: [PATCH 141/189] update error message for multiple algorithms in text chunking processor Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 2 +- 
.../neuralsearch/processor/TextChunkingProcessorTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index a09a2190a..74d1e424a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -93,7 +93,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { ); } else if (algorithmMap.size() > 1) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "Unable to create %s processor as [%s] contain multiple algorithms", TYPE, ALGORITHM_FIELD) + String.format(Locale.ROOT, "Unable to create %s processor as [%s] contains multiple algorithms", TYPE, ALGORITHM_FIELD) ); } Entry algorithmEntry = algorithmMap.entrySet().iterator().next(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 877e1974d..1ed1a3bdb 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -228,7 +228,7 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - String.format(Locale.ROOT, "Unable to create %s processor as [%s] contain multiple algorithms", TYPE, ALGORITHM_FIELD), + String.format(Locale.ROOT, "Unable to create %s processor as [%s] contains multiple algorithms", TYPE, ALGORITHM_FIELD), illegalArgumentException.getMessage() ); } From 183e92852fce1db8dea1fc88565d6dc348b84cf5 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 17:09:35 +0800 Subject: [PATCH 142/189] add comment in text chunking processor Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 74d1e424a..3cb0d781c 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -96,6 +96,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { String.format(Locale.ROOT, "Unable to create %s processor as [%s] contains multiple algorithms", TYPE, ALGORITHM_FIELD) ); } + Entry algorithmEntry = algorithmMap.entrySet().iterator().next(); String algorithmKey = algorithmEntry.getKey(); Object algorithmValue = algorithmEntry.getValue(); @@ -122,7 +123,9 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { ) ); } + Map chunkerParameters = (Map) algorithmValue; + // fixed token length algorithm needs analysis registry for tokenization if (Objects.equals(algorithmKey, FixedTokenLengthChunker.ALGORITHM_NAME)) { chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); } From 09fccc1b6f1f28d9a759bd2c1bb04224f649ea11 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 17:18:45 +0800 Subject: [PATCH 143/189] validate max chunk limit with util parameter class Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 
24 +++++-------------- .../chunker/ChunkerParameterValidator.java | 19 +++++++-------- .../processor/TextChunkingProcessorTests.java | 2 +- .../chunker/DelimiterChunkerTests.java | 7 ++---- .../chunker/FixedTokenLengthChunkerTests.java | 20 ++++------------ 5 files changed, 22 insertions(+), 50 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 3cb0d781c..5c5b24418 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -14,7 +14,6 @@ import java.util.Objects; import com.google.common.annotations.VisibleForTesting; -import org.apache.commons.lang3.math.NumberUtils; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.env.Environment; @@ -29,6 +28,7 @@ import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.Chunker; import org.opensearch.index.mapper.IndexFieldMapper; +import org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; /** @@ -130,23 +130,11 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); } this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); - if (chunkerParameters.containsKey(MAX_CHUNK_LIMIT_FIELD)) { - String maxChunkLimitString = chunkerParameters.get(MAX_CHUNK_LIMIT_FIELD).toString(); - if (!(NumberUtils.isParsable(maxChunkLimitString))) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", MAX_CHUNK_LIMIT_FIELD, Number.class.getName()) - ); - } - int maxChunkLimit = NumberUtils.createInteger(maxChunkLimitString); - if (maxChunkLimit <= 0 && maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be a positive integer", MAX_CHUNK_LIMIT_FIELD) - ); - } - this.maxChunkLimit = maxChunkLimit; - } else { - this.maxChunkLimit = DEFAULT_MAX_CHUNK_LIMIT; - } + this.maxChunkLimit = ChunkerParameterValidator.validatePositiveIntegerParameter( + chunkerParameters, + MAX_CHUNK_LIMIT_FIELD, + DEFAULT_MAX_CHUNK_LIMIT + ); } @SuppressWarnings("unchecked") diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index 9893efc5f..ae0776673 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -11,7 +11,7 @@ import java.util.Locale; /** - * Validate and parse the parameter for chunking algorithms + * Validate and parse the parameter for text chunking processor and algorithms */ public class ChunkerParameterValidator { @@ -31,12 +31,10 @@ public static String validateStringParameters( Object fieldValue = parameters.get(fieldName); if (!(fieldValue instanceof String)) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", fieldName, String.class.getName()) + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, 
String.class.getName()) ); } else if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] should not be empty.", fieldName) - ); + throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName)); } return (String) fieldValue; } @@ -52,7 +50,7 @@ public static Number validateNumberParameter(Map parameters, Str String fieldValue = parameters.get(fieldName).toString(); if (!(NumberUtils.isParsable(fieldValue))) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) ); } return NumberUtils.createNumber(fieldValue); @@ -64,10 +62,9 @@ public static Number validateNumberParameter(Map parameters, Str public static int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); int fieldValueInt = fieldValueNumber.intValue(); - if (fieldValueInt <= 0) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be positive.", fieldName) - ); + // sometimes parameter has negative default value, indicating that this parameter is not effective + if (fieldValueInt != defaultValue && fieldValueInt <= 0) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName)); } return fieldValueInt; } @@ -86,7 +83,7 @@ public static double validateRangeDoubleParameter( double fieldValueDouble = fieldValueNumber.doubleValue(); if (fieldValueDouble < lowerBound || fieldValueDouble > upperBound) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be between %s and %s", fieldName, lowerBound, upperBound) + String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", fieldName, lowerBound, upperBound) ); } return fieldValueDouble; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 1ed1a3bdb..6965d6fd7 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -190,7 +190,7 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] must be a positive integer", MAX_CHUNK_LIMIT_FIELD), + String.format(Locale.ROOT, "Parameter [%s] must be positive.", MAX_CHUNK_LIMIT_FIELD), illegalArgumentException.getMessage() ); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 50615082b..c1d489d46 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -21,17 +21,14 @@ public void testChunkerWithDelimiterFieldNotString() { () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, List.of(""))) ); 
Assert.assertEquals( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", DELIMITER_FIELD, String.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", DELIMITER_FIELD, String.class.getName()), exception.getMessage() ); } public void testChunkerWithDelimiterFieldNoString() { Exception exception = assertThrows(IllegalArgumentException.class, () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, ""))); - Assert.assertEquals( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] should not be empty.", DELIMITER_FIELD), - exception.getMessage() - ); + Assert.assertEquals(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", DELIMITER_FIELD), exception.getMessage()); } public void testChunker() { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 2acd3cd8f..6ab9c3518 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -74,12 +74,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - String.format( - Locale.ROOT, - "Chunking algorithm parameter [%s] cannot be cast to [%s]", - TOKEN_LIMIT_FIELD, - Number.class.getName() - ), + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", TOKEN_LIMIT_FIELD, Number.class.getName()), illegalArgumentException.getMessage() ); } @@ -92,7 +87,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail() () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be positive.", TOKEN_LIMIT_FIELD), + String.format(Locale.ROOT, "Parameter [%s] must be positive.", TOKEN_LIMIT_FIELD), illegalArgumentException.getMessage() ); } @@ -105,12 +100,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - String.format( - Locale.ROOT, - "Chunking algorithm parameter [%s] cannot be cast to [%s]", - OVERLAP_RATE_FIELD, - Number.class.getName() - ), + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", OVERLAP_RATE_FIELD, Number.class.getName()), illegalArgumentException.getMessage() ); } @@ -123,7 +113,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail( () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, 0.0, 0.5), + String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, 0.0, 0.5), illegalArgumentException.getMessage() ); } @@ -136,7 +126,7 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Chunking algorithm parameter [%s] cannot be cast to [%s]", TOKENIZER_FIELD, String.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", TOKENIZER_FIELD, String.class.getName()), illegalArgumentException.getMessage() ); } 
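The validator consolidated in this patch gives max_chunk_limit sentinel-default semantics: a value equal to the default passes even when the default itself is negative, which is how "no limit" is expressed. Below is a minimal, self-contained sketch of that contract; the class name, the demo values, and the -1 sentinel are illustrative assumptions, and the real logic lives in ChunkerParameterValidator.validatePositiveIntegerParameter, which parses with NumberUtils rather than Integer.parseInt.

    // Hedged sketch of the positive-integer parameter contract introduced above.
    // Assumption: -1 stands in for DEFAULT_MAX_CHUNK_LIMIT, i.e. "limit disabled".
    import java.util.Locale;
    import java.util.Map;

    class PositiveIntegerParameterSketch {
        static int validatePositiveInteger(final Map<String, ?> parameters, final String fieldName, final int defaultValue) {
            if (!parameters.containsKey(fieldName)) {
                return defaultValue; // all chunking algorithm parameters are optional
            }
            // the real implementation first checks NumberUtils.isParsable and reports a cast error
            final int value = Integer.parseInt(parameters.get(fieldName).toString());
            // a value equal to the (possibly negative) default is the sentinel and is accepted as-is
            if (value != defaultValue && value <= 0) {
                throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
            }
            return value;
        }

        public static void main(String[] args) {
            System.out.println(validatePositiveInteger(Map.of("max_chunk_limit", 100), "max_chunk_limit", -1)); // 100
            System.out.println(validatePositiveInteger(Map.of(), "max_chunk_limit", -1)); // -1, chunk limit disabled
            // validatePositiveInteger(Map.of("max_chunk_limit", 0), "max_chunk_limit", -1) throws IllegalArgumentException
        }
    }

Checking equality with the default before the positivity test lets one code path serve both ordinary positive limits and the disabled sentinel, which is why the processor above no longer needs its own NumberUtils handling.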
From 8ad1e515fe69c8ee947552f289fa0f66c38594d4 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 17:26:57 +0800 Subject: [PATCH 144/189] update comments Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 81 ++++++++++--------- .../processor/chunker/Chunker.java | 2 +- .../processor/chunker/ChunkerFactory.java | 2 +- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 5c5b24418..f35eb85de 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -125,8 +125,8 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { } Map chunkerParameters = (Map) algorithmValue; - // fixed token length algorithm needs analysis registry for tokenization if (Objects.equals(algorithmKey, FixedTokenLengthChunker.ALGORITHM_NAME)) { + // fixed token length algorithm needs analysis registry for tokenization chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); } this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); @@ -151,50 +151,12 @@ private boolean isListOfString(Object value) { return true; } - private int chunkString(String content, List result, Map runTimeParameters, int chunkCount) { - // chunk the content, return the updated chunkCount and add chunk passages to result - List contentResult = chunker.chunk(content, runTimeParameters); - chunkCount += contentResult.size(); - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCount > maxChunkLimit) { - throw new IllegalArgumentException( - String.format( - Locale.ROOT, - "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]", - chunkCount, - maxChunkLimit - ) - ); - } - result.addAll(contentResult); - return chunkCount; - } - - private int chunkList(List contentList, List result, Map runTimeParameters, int chunkCount) { - // flatten the List> output to List - for (String content : contentList) { - chunkCount = chunkString(content, result, runTimeParameters, chunkCount); - } - return chunkCount; - } - - @SuppressWarnings("unchecked") - private int chunkLeafType(Object value, List result, Map runTimeParameters, int chunkCount) { - // leaf type is either String or List - // the result should be an empty string - if (value instanceof String) { - chunkCount = chunkString(value.toString(), result, runTimeParameters, chunkCount); - } else if (isListOfString(value)) { - chunkCount = chunkList((List) value, result, runTimeParameters, chunkCount); - } - return chunkCount; - } - private int getMaxTokenCount(Map sourceAndMetadataMap) { String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); int maxTokenCount; if (indexMetadata != null) { - // if the index exists, read maxTokenCount from the index setting + // if the index is specified in the metadata, read maxTokenCount from the index setting IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); } else { @@ -214,6 +176,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { Map runtimeParameters = new HashMap<>(); Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); 
if (chunker instanceof FixedTokenLengthChunker) { + // fixed token length algorithm needs max_token_count for tokenization int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); } @@ -318,4 +281,42 @@ private int chunkMapType( } return chunkCount; } + + private int chunkString(String content, List result, Map runTimeParameters, int chunkCount) { + // chunk the content, return the updated chunkCount and add chunk passages into result + List contentResult = chunker.chunk(content, runTimeParameters); + chunkCount += contentResult.size(); + if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCount > maxChunkLimit) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]", + chunkCount, + maxChunkLimit + ) + ); + } + result.addAll(contentResult); + return chunkCount; + } + + private int chunkList(List contentList, List result, Map runTimeParameters, int chunkCount) { + // flatten original output format from List> to List + for (String content : contentList) { + chunkCount = chunkString(content, result, runTimeParameters, chunkCount); + } + return chunkCount; + } + + @SuppressWarnings("unchecked") + private int chunkLeafType(Object value, List result, Map runTimeParameters, int chunkCount) { + // leaf type means either String or List + // the result should be an empty list + if (value instanceof String) { + chunkCount = chunkString(value.toString(), result, runTimeParameters, chunkCount); + } else if (isListOfString(value)) { + chunkCount = chunkList((List) value, result, runTimeParameters, chunkCount); + } + return chunkCount; + } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 8419c1d98..d6aa5fe39 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -17,7 +17,7 @@ public interface Chunker { * Validate and parse the parameters for chunking algorithm, * will throw IllegalArgumentException when parameters are invalid * - * @param parameters a map containing parameters for chunking algorithms + * @param parameters a map containing non-runtime parameters for chunking algorithms */ void validateAndParseParameters(Map parameters); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index d6c475512..9623eee21 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -9,7 +9,7 @@ import java.util.Locale; /** - * A factory to create different chunking algorithm classes and return all supported chunking algorithms. + * A factory to create different chunking algorithm objects and return all supported chunking algorithms. 
*/ public class ChunkerFactory { From 489fe7b9a5171a199db8641ddbf5ba881b7e5e59 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 13 Mar 2024 17:35:23 +0800 Subject: [PATCH 145/189] update comments Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 1f9c0610f..86ab392aa 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -25,6 +25,8 @@ public class FixedTokenLengthChunker implements Chunker { public static final String ALGORITHM_NAME = "fixed_token_length"; + + // field name for each parameter public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; public static final String TOKEN_LIMIT_FIELD = "token_limit"; public static final String OVERLAP_RATE_FIELD = "overlap_rate"; @@ -36,6 +38,9 @@ public class FixedTokenLengthChunker implements Chunker { private static final double DEFAULT_OVERLAP_RATE = 0.0; private static final int DEFAULT_MAX_TOKEN_COUNT = 10000; private static final String DEFAULT_TOKENIZER = "standard"; + + // parameter restrictions + private static final double OVERLAP_RATE_LOWER_BOUND = 0.0; private static final double OVERLAP_RATE_UPPER_BOUND = 0.5; private static final Set WORD_TOKENIZERS = Set.of( "standard", @@ -47,6 +52,7 @@ public class FixedTokenLengthChunker implements Chunker { "thai" ); + // parameter value private int tokenLimit; private String tokenizer; private double overlapRate; @@ -77,7 +83,7 @@ public void validateAndParseParameters(Map parameters) { this.overlapRate = validateRangeDoubleParameter( parameters, OVERLAP_RATE_FIELD, - DEFAULT_OVERLAP_RATE, + OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND, DEFAULT_OVERLAP_RATE ); @@ -104,11 +110,11 @@ public void validateAndParseParameters(Map parameters) { */ @Override public List chunk(String content, Map runtimeParameters) { - // prior to chunking, runtimeParameters have been validated + // before chunking, validate and parse runtimeParameters int maxTokenCount = validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); List tokens = tokenize(content, tokenizer, maxTokenCount); - List passages = new ArrayList<>(); + List chunkResult = new ArrayList<>(); int startTokenIndex = 0; int startContentPosition, endContentPosition; @@ -124,16 +130,16 @@ public List chunk(String content, Map runtimeParameters) if (startTokenIndex + tokenLimit >= tokens.size()) { // include all characters till the end if no next passage endContentPosition = content.length(); - passages.add(content.substring(startContentPosition, endContentPosition)); + chunkResult.add(content.substring(startContentPosition, endContentPosition)); break; } else { // include gap characters between two passages endContentPosition = tokens.get(startTokenIndex + tokenLimit).getStartOffset(); - passages.add(content.substring(startContentPosition, endContentPosition)); + chunkResult.add(content.substring(startContentPosition, endContentPosition)); } startTokenIndex += tokenLimit - overlapTokenNumber; } - return passages; + return chunkResult; } private List tokenize(String content, String tokenizer, int maxTokenCount) { From d67880e9dec0c4b34c9bd80864d8ac7b54a9d8ae 
Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 00:11:43 +0800 Subject: [PATCH 146/189] update java doc Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 7 ++++--- .../processor/chunker/FixedTokenLengthChunker.java | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index f35eb85de..0a389c34b 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -32,9 +32,10 @@ import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; /** - * This processor is used for chunking user input data and chunked data could be used for downstream embedding processor, - * algorithm can be used to indicate chunking algorithm and parameters, - * and field_map can be used to indicate which fields needs chunking and the corresponding keys for the chunking results. + * This processor is used for user input data text chunking. + * The chunking results could be fed to downstream embedding processor, + * algorithm defines chunking algorithm and parameters, + * and field_map specifies which fields needs chunking and the corresponding keys for the chunking results. */ public final class TextChunkingProcessor extends AbstractProcessor { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 86ab392aa..9c8466e95 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -106,7 +106,7 @@ public void validateAndParseParameters(Map parameters) { * * @param content input string * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: - * max_token_count the max token limit for the tokenizer + * 1. max_token_count the max token limit for the tokenizer */ @Override public List chunk(String content, Map runtimeParameters) { From 666e7b93c4e0e1ce5483fca44b6b360afb1b994f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 00:14:23 +0800 Subject: [PATCH 147/189] update java doc Signed-off-by: yuye-aws --- .../org/opensearch/neuralsearch/processor/chunker/Chunker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index d6aa5fe39..45f758569 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -9,7 +9,7 @@ /** * The interface for all chunking algorithms. - * All algorithms need to validate parameters and chunk the content, + * All algorithms need to validate parameters and chunk the content. 
*/ public interface Chunker { From 9161c935d1ad84110942f4d9927c35cb93d92a47 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 11:29:09 +0800 Subject: [PATCH 148/189] make parameter final Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 38 +++++++++---------- .../processor/chunker/ChunkerFactory.java | 2 +- .../chunker/ChunkerParameterValidator.java | 22 +++++------ .../processor/chunker/DelimiterChunker.java | 6 +-- .../chunker/FixedTokenLengthChunker.java | 8 ++-- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 0a389c34b..df8f4aebf 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -64,14 +64,14 @@ public final class TextChunkingProcessor extends AbstractProcessor { private final Environment environment; public TextChunkingProcessor( - String tag, - String description, - Map fieldMap, - Map algorithmMap, - Environment environment, - ClusterService clusterService, - IndicesService indicesService, - AnalysisRegistry analysisRegistry + final String tag, + final String description, + final Map fieldMap, + final Map algorithmMap, + final Environment environment, + final ClusterService clusterService, + final IndicesService indicesService, + final AnalysisRegistry analysisRegistry ) { super(tag, description); this.fieldMap = fieldMap; @@ -87,7 +87,7 @@ public String getType() { } @SuppressWarnings("unchecked") - private void validateAndParseAlgorithmMap(Map algorithmMap) { + private void validateAndParseAlgorithmMap(final Map algorithmMap) { if (algorithmMap.isEmpty()) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD) @@ -139,7 +139,7 @@ private void validateAndParseAlgorithmMap(Map algorithmMap) { } @SuppressWarnings("unchecked") - private boolean isListOfString(Object value) { + private boolean isListOfString(final Object value) { // an empty list is also List if (!(value instanceof List)) { return false; @@ -152,7 +152,7 @@ private boolean isListOfString(Object value) { return true; } - private int getMaxTokenCount(Map sourceAndMetadataMap) { + private int getMaxTokenCount(final Map sourceAndMetadataMap) { String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); int maxTokenCount; @@ -185,7 +185,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { return ingestDocument; } - private void validateFieldsValue(IngestDocument ingestDocument) { + private void validateFieldsValue(final IngestDocument ingestDocument) { Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); for (Map.Entry embeddingFieldsEntry : fieldMap.entrySet()) { Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey()); @@ -203,7 +203,7 @@ private void validateFieldsValue(IngestDocument ingestDocument) { } @SuppressWarnings({ "rawtypes", "unchecked" }) - private void validateNestedTypeValue(String sourceKey, Object sourceValue, int maxDepth) { + private void validateNestedTypeValue(final String sourceKey, final Object sourceValue, final int maxDepth) { if (maxDepth > 
MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) { throw new IllegalArgumentException( String.format(Locale.ROOT, "map type field [%s] reached max depth limit, cannot process it", sourceKey) @@ -223,7 +223,7 @@ private void validateNestedTypeValue(String sourceKey, Object sourceValue, int m } @SuppressWarnings({ "rawtypes" }) - private void validateListTypeValue(String sourceKey, Object sourceValue, int maxDepth) { + private void validateListTypeValue(final String sourceKey, final Object sourceValue, final int maxDepth) { for (Object value : (List) sourceValue) { if (value instanceof Map) { validateNestedTypeValue(sourceKey, value, maxDepth + 1); @@ -242,8 +242,8 @@ private void validateListTypeValue(String sourceKey, Object sourceValue, int max @SuppressWarnings("unchecked") private int chunkMapType( Map sourceAndMetadataMap, - Map fieldMap, - Map runtimeParameters, + final Map fieldMap, + final Map runtimeParameters, int chunkCount ) { for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { @@ -283,7 +283,7 @@ private int chunkMapType( return chunkCount; } - private int chunkString(String content, List result, Map runTimeParameters, int chunkCount) { + private int chunkString(final String content, List result, final Map runTimeParameters, int chunkCount) { // chunk the content, return the updated chunkCount and add chunk passages into result List contentResult = chunker.chunk(content, runTimeParameters); chunkCount += contentResult.size(); @@ -301,7 +301,7 @@ private int chunkString(String content, List result, Map return chunkCount; } - private int chunkList(List contentList, List result, Map runTimeParameters, int chunkCount) { + private int chunkList(final List contentList, List result, final Map runTimeParameters, int chunkCount) { // flatten original output format from List> to List for (String content : contentList) { chunkCount = chunkString(content, result, runTimeParameters, chunkCount); @@ -310,7 +310,7 @@ private int chunkList(List contentList, List result, Map result, Map runTimeParameters, int chunkCount) { + private int chunkLeafType(final Object value, List result, final Map runTimeParameters, int chunkCount) { // leaf type means either String or List // the result should be an empty list if (value instanceof String) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 9623eee21..f2d84b5ad 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -13,7 +13,7 @@ */ public class ChunkerFactory { - public static Chunker create(String type, Map parameters) { + public static Chunker create(final String type, final Map parameters) { switch (type) { case FixedTokenLengthChunker.ALGORITHM_NAME: return new FixedTokenLengthChunker(parameters); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index ae0776673..33c5fdee9 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -19,10 +19,10 @@ public class ChunkerParameterValidator { * Validate and parse the parameter for string parameters */ public static String 
validateStringParameters( - Map parameters, - String fieldName, - String defaultValue, - boolean allowEmpty + final Map parameters, + final String fieldName, + final String defaultValue, + final boolean allowEmpty ) { if (!parameters.containsKey(fieldName)) { // all chunking algorithm parameters are optional @@ -42,7 +42,7 @@ public static String validateStringParameters( /** * Validate and parse the parameter for numeric parameters */ - public static Number validateNumberParameter(Map parameters, String fieldName, Number defaultValue) { + public static Number validateNumberParameter(final Map parameters, final String fieldName, final Number defaultValue) { if (!parameters.containsKey(fieldName)) { // all chunking algorithm parameters are optional return defaultValue; @@ -59,7 +59,7 @@ public static Number validateNumberParameter(Map parameters, Str /** * Validate and parse the parameter for positive integer parameters */ - public static int validatePositiveIntegerParameter(Map parameters, String fieldName, int defaultValue) { + public static int validatePositiveIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); int fieldValueInt = fieldValueNumber.intValue(); // sometimes parameter has negative default value, indicating that this parameter is not effective @@ -73,11 +73,11 @@ public static int validatePositiveIntegerParameter(Map parameter * Validate and parse the parameter for double parameters within [lowerBound, upperBound] */ public static double validateRangeDoubleParameter( - Map parameters, - String fieldName, - double lowerBound, - double upperBound, - double defaultValue + final Map parameters, + final String fieldName, + final double lowerBound, + final double upperBound, + final double defaultValue ) { Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); double fieldValueDouble = fieldValueNumber.doubleValue(); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index ef181fac8..4eef512e6 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -21,7 +21,7 @@ public class DelimiterChunker implements Chunker { private String delimiter; - public DelimiterChunker(Map parameters) { + public DelimiterChunker(final Map parameters) { validateAndParseParameters(parameters); } @@ -33,7 +33,7 @@ public DelimiterChunker(Map parameters) { * 1. 
A string as the paragraph split indicator */ @Override - public void validateAndParseParameters(Map parameters) { + public void validateAndParseParameters(final Map parameters) { this.delimiter = validateStringParameters(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER, false); } @@ -44,7 +44,7 @@ public void validateAndParseParameters(Map parameters) { * @param runtimeParameters a map for runtime parameters, but not needed by delimiter algorithm */ @Override - public List chunk(String content, Map runtimeParameters) { + public List chunk(final String content, final Map runtimeParameters) { List chunkResult = new ArrayList<>(); int start = 0, end; int nextDelimiterPosition = content.indexOf(delimiter); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 9c8466e95..ef38d13cf 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -58,7 +58,7 @@ public class FixedTokenLengthChunker implements Chunker { private double overlapRate; private final AnalysisRegistry analysisRegistry; - public FixedTokenLengthChunker(Map parameters) { + public FixedTokenLengthChunker(final Map parameters) { validateAndParseParameters(parameters); this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD); } @@ -78,7 +78,7 @@ public FixedTokenLengthChunker(Map parameters) { * tokenizer should be string */ @Override - public void validateAndParseParameters(Map parameters) { + public void validateAndParseParameters(final Map parameters) { this.tokenLimit = validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); this.overlapRate = validateRangeDoubleParameter( parameters, @@ -109,7 +109,7 @@ public void validateAndParseParameters(Map parameters) { * 1. 
max_token_count the max token limit for the tokenizer */ @Override - public List chunk(String content, Map runtimeParameters) { + public List chunk(final String content, final Map runtimeParameters) { // before chunking, validate and parse runtimeParameters int maxTokenCount = validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); @@ -142,7 +142,7 @@ public List chunk(String content, Map runtimeParameters) return chunkResult; } - private List tokenize(String content, String tokenizer, int maxTokenCount) { + private List tokenize(final String content, final String tokenizer, final int maxTokenCount) { AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(); analyzeRequest.text(content); analyzeRequest.tokenizer(tokenizer); From 0f9c140f8812d47a878563d10388228bc796659f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 12:55:22 +0800 Subject: [PATCH 149/189] implement a map from chunker name to constructor function in chunker factory Signed-off-by: yuye-aws --- .../processor/chunker/ChunkerFactory.java | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index f2d84b5ad..6ee50442f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -4,34 +4,36 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import com.google.common.collect.ImmutableMap; + import java.util.Map; import java.util.Set; import java.util.Locale; +import java.util.function.Function; /** * A factory to create different chunking algorithm objects and return all supported chunking algorithms. */ public class ChunkerFactory { + private static final ImmutableMap, Chunker>> chunkers = ImmutableMap.of( + FixedTokenLengthChunker.ALGORITHM_NAME, + FixedTokenLengthChunker::new, + DelimiterChunker.ALGORITHM_NAME, + FixedTokenLengthChunker::new + ); + public static Chunker create(final String type, final Map parameters) { - switch (type) { - case FixedTokenLengthChunker.ALGORITHM_NAME: - return new FixedTokenLengthChunker(parameters); - case DelimiterChunker.ALGORITHM_NAME: - return new DelimiterChunker(parameters); - default: - throw new IllegalArgumentException( - String.format( - Locale.ROOT, - "chunking algorithm [%s] is not supported. Supported chunking algorithms are %s", - type, - ChunkerFactory.getAllChunkers() - ) - ); + Function, Chunker> chunkerConstructionFunction= chunkers.get(type); + if (chunkerConstructionFunction == null) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "chunking algorithm [%s] is not supported. 
Supported chunking algorithms are %s", type, chunkers.keySet()) + ); } + return chunkerConstructionFunction.apply(parameters); } public static Set getAllChunkers() { - return Set.of(FixedTokenLengthChunker.ALGORITHM_NAME, DelimiterChunker.ALGORITHM_NAME); + return chunkers.keySet(); } } From a5749807a5e2aab251ed94cffce922cc808f6363 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 13:21:18 +0800 Subject: [PATCH 150/189] bug fix in chunker factory Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 7 ++++++- .../processor/chunker/ChunkerFactory.java | 19 ++++++++++++------- .../chunker/ChunkerParameterValidator.java | 6 +++++- .../chunker/ChunkerFactoryTests.java | 4 ++-- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index df8f4aebf..3a285dc50 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -301,7 +301,12 @@ private int chunkString(final String content, List result, final Map contentList, List result, final Map runTimeParameters, int chunkCount) { + private int chunkList( + final List contentList, + List result, + final Map runTimeParameters, + int chunkCount + ) { // flatten original output format from List> to List for (String content : contentList) { chunkCount = chunkString(content, result, runTimeParameters, chunkCount); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 6ee50442f..119c444d6 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -5,6 +5,7 @@ package org.opensearch.neuralsearch.processor.chunker; import com.google.common.collect.ImmutableMap; +import lombok.Getter; import java.util.Map; import java.util.Set; @@ -20,20 +21,24 @@ public class ChunkerFactory { FixedTokenLengthChunker.ALGORITHM_NAME, FixedTokenLengthChunker::new, DelimiterChunker.ALGORITHM_NAME, - FixedTokenLengthChunker::new + DelimiterChunker::new ); + @Getter + private static final Set allChunkers = chunkers.keySet(); + public static Chunker create(final String type, final Map parameters) { - Function, Chunker> chunkerConstructionFunction= chunkers.get(type); + Function, Chunker> chunkerConstructionFunction = chunkers.get(type); if (chunkerConstructionFunction == null) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "chunking algorithm [%s] is not supported. Supported chunking algorithms are %s", type, chunkers.keySet()) + String.format( + Locale.ROOT, + "chunking algorithm [%s] is not supported. 
Supported chunking algorithms are %s", + type, + chunkers.keySet() + ) ); } return chunkerConstructionFunction.apply(parameters); } - - public static Set getAllChunkers() { - return chunkers.keySet(); - } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index 33c5fdee9..66fd072d1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -59,7 +59,11 @@ public static Number validateNumberParameter(final Map parameter /** * Validate and parse the parameter for positive integer parameters */ - public static int validatePositiveIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { + public static int validatePositiveIntegerParameter( + final Map parameters, + final String fieldName, + final int defaultValue + ) { Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); int fieldValueInt = fieldValueNumber.intValue(); // sometimes parameter has negative default value, indicating that this parameter is not effective diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 4bfb3f366..0d7f7d755 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -28,13 +28,13 @@ public void testGetAllChunkers() { public void testCreate_FixedTokenLength() { Chunker chunker = ChunkerFactory.create(FixedTokenLengthChunker.ALGORITHM_NAME, createChunkParameters()); assertNotNull(chunker); - assertTrue(chunker instanceof FixedTokenLengthChunker); + assert (chunker instanceof FixedTokenLengthChunker); } public void testCreate_Delimiter() { Chunker chunker = ChunkerFactory.create(DelimiterChunker.ALGORITHM_NAME, createChunkParameters()); assertNotNull(chunker); - assertTrue(chunker instanceof DelimiterChunker); + assert (chunker instanceof DelimiterChunker); } public void testCreate_Invalid() { From 87679ada085f4602ab0619bf4b263671fcbae91c Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 13:38:22 +0800 Subject: [PATCH 151/189] remove get all chunkers in chunker factory Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 13 ------------- .../processor/chunker/ChunkerFactory.java | 7 +------ .../processor/TextChunkingProcessorTests.java | 9 +-------- .../processor/chunker/ChunkerFactoryTests.java | 12 +++--------- 4 files changed, 5 insertions(+), 36 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 3a285dc50..1d654e3a1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -7,7 +7,6 @@ import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; -import java.util.Set; import java.util.Locale; import java.util.ArrayList; import java.util.List; @@ -101,18 +100,6 @@ private void validateAndParseAlgorithmMap(final Map algorithmMap Entry algorithmEntry = 
algorithmMap.entrySet().iterator().next(); String algorithmKey = algorithmEntry.getKey(); Object algorithmValue = algorithmEntry.getValue(); - Set supportedChunkers = ChunkerFactory.getAllChunkers(); - if (!supportedChunkers.contains(algorithmKey)) { - throw new IllegalArgumentException( - String.format( - Locale.ROOT, - "Unable to create %s processor as chunker algorithm [%s] is not supported. Supported chunkers types are %s", - TYPE, - algorithmKey, - supportedChunkers - ) - ); - } if (!(algorithmValue instanceof Map)) { throw new IllegalArgumentException( String.format( diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 119c444d6..d66bc423e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -5,10 +5,8 @@ package org.opensearch.neuralsearch.processor.chunker; import com.google.common.collect.ImmutableMap; -import lombok.Getter; import java.util.Map; -import java.util.Set; import java.util.Locale; import java.util.function.Function; @@ -24,16 +22,13 @@ public class ChunkerFactory { DelimiterChunker::new ); - @Getter - private static final Set allChunkers = chunkers.keySet(); - public static Chunker create(final String type, final Map parameters) { Function, Chunker> chunkerConstructionFunction = chunkers.get(type); if (chunkerConstructionFunction == null) { throw new IllegalArgumentException( String.format( Locale.ROOT, - "chunking algorithm [%s] is not supported. Supported chunking algorithms are %s", + "Chunking algorithm [%s] is not supported. Supported chunking algorithms are %s", type, chunkers.keySet() ) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 6965d6fd7..2ea5f314a 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -248,14 +248,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_thenFail() { () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) ); assert (illegalArgumentException.getMessage() - .contains( - String.format( - Locale.ROOT, - "Unable to create %s processor as chunker algorithm [%s] is not supported.", - TYPE, - invalid_algorithm_type - ) - )); + .contains(String.format(Locale.ROOT, "Chunking algorithm [%s] is not supported.", invalid_algorithm_type))); } public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java index 0d7f7d755..2b06ca10a 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java @@ -11,7 +11,6 @@ import java.util.HashMap; import java.util.Locale; import java.util.Map; -import java.util.Set; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; @@ -20,11 +19,6 @@ public class ChunkerFactoryTests extends OpenSearchTestCase { @Mock private AnalysisRegistry analysisRegistry; - public void 
testGetAllChunkers() { - Set expected = Set.of(FixedTokenLengthChunker.ALGORITHM_NAME, DelimiterChunker.ALGORITHM_NAME); - assertEquals(expected, ChunkerFactory.getAllChunkers()); - } - public void testCreate_FixedTokenLength() { Chunker chunker = ChunkerFactory.create(FixedTokenLengthChunker.ALGORITHM_NAME, createChunkParameters()); assertNotNull(chunker); @@ -38,13 +32,13 @@ public void testCreate_Delimiter() { } public void testCreate_Invalid() { - String invalidChunkerType = "Invalid Chunker Type"; + String invalidChunkerName = "Invalid Chunker Algorithm"; IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> ChunkerFactory.create(invalidChunkerType, createChunkParameters()) + () -> ChunkerFactory.create(invalidChunkerName, createChunkParameters()) ); assert (illegalArgumentException.getMessage() - .contains(String.format(Locale.ROOT, "chunking algorithm [%s] is not supported.", invalidChunkerType))); + .contains(String.format(Locale.ROOT, "Chunking algorithm [%s] is not supported.", invalidChunkerName))); } private Map createChunkParameters() { From 08dcd19a4e47e4b40960e8287195d109ae860266 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 13:41:16 +0800 Subject: [PATCH 152/189] remove type check for max token count parameter Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 1d654e3a1..6b3905642 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -163,11 +163,9 @@ public IngestDocument execute(IngestDocument ingestDocument) { int chunkCount = 0; Map runtimeParameters = new HashMap<>(); Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); - if (chunker instanceof FixedTokenLengthChunker) { - // fixed token length algorithm needs max_token_count for tokenization - int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); - runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); - } + // fixed token length algorithm needs max_token_count for tokenization + int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); + runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, chunkCount); return ingestDocument; } From f16882dc3e18f12999867fb6684103aae0add20d Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 13:42:24 +0800 Subject: [PATCH 153/189] remove type check for analysis registry parameter Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 6b3905642..7bfd9669f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -113,10 +113,8 @@ private void validateAndParseAlgorithmMap(final Map algorithmMap } Map chunkerParameters = (Map) algorithmValue; - if (Objects.equals(algorithmKey, 
FixedTokenLengthChunker.ALGORITHM_NAME)) { - // fixed token length algorithm needs analysis registry for tokenization - chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); - } + // fixed token length algorithm needs analysis registry for tokenization + chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); this.maxChunkLimit = ChunkerParameterValidator.validatePositiveIntegerParameter( chunkerParameters, From a969a60312dd364a62552e4cc6bc2a4937a3f42a Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 14:29:24 +0800 Subject: [PATCH 154/189] implement parser and validator Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 10 ++- .../processor/chunker/Chunker.java | 12 +++- .../chunker/ChunkerParameterParser.java | 50 ++++++++++++++ .../chunker/ChunkerParameterValidator.java | 69 +++++++++---------- .../processor/chunker/DelimiterChunker.java | 24 +++++-- .../chunker/FixedTokenLengthChunker.java | 53 +++++++++----- .../chunker/FixedTokenLengthChunkerTests.java | 17 +++-- 7 files changed, 156 insertions(+), 79 deletions(-) create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 7bfd9669f..b2fe4d7d7 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -27,8 +27,9 @@ import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.Chunker; import org.opensearch.index.mapper.IndexFieldMapper; -import org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; /** * This processor is used for user input data text chunking. 
@@ -116,11 +117,8 @@ private void validateAndParseAlgorithmMap(final Map algorithmMap // fixed token length algorithm needs analysis registry for tokenization chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); - this.maxChunkLimit = ChunkerParameterValidator.validatePositiveIntegerParameter( - chunkerParameters, - MAX_CHUNK_LIMIT_FIELD, - DEFAULT_MAX_CHUNK_LIMIT - ); + validatePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); + this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); } @SuppressWarnings("unchecked") diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 45f758569..af8290f7e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -14,12 +14,20 @@ public interface Chunker { /** - * Validate and parse the parameters for chunking algorithm, + * Validate the parameters for chunking algorithm, * will throw IllegalArgumentException when parameters are invalid * * @param parameters a map containing non-runtime parameters for chunking algorithms */ - void validateAndParseParameters(Map parameters); + void validateParameters(Map parameters); + + /** + * Parse the parameters for chunking algorithm. + * The parameters must be validated before parsing. + * + * @param parameters a map containing non-runtime parameters for chunking algorithms + */ + void parseParameters(Map parameters); /** * Chunk the incoming string according to parameters and return chunked passages diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java new file mode 100644 index 000000000..5fb5d6666 --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -0,0 +1,50 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.processor.chunker; + +import org.apache.commons.lang3.math.NumberUtils; + +import java.util.Map; + +/** + * Parse the parameter for text chunking processor and algorithms. + * The parameter must be validated before parsing. 
+ */ +public class ChunkerParameterParser { + + /** + * Parse string type parameter + */ + public static String parseStringParameter(final Map parameters, final String fieldName, final String defaultValue) { + if (!parameters.containsKey(fieldName)) { + return defaultValue; + } + return parameters.get(fieldName).toString(); + } + + /** + * Parse integer type parameter + */ + public static int parseIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { + if (!parameters.containsKey(fieldName)) { + // all chunking algorithm parameters are optional + return defaultValue; + } + String fieldValueString = parameters.get(fieldName).toString(); + return NumberUtils.createInteger(fieldValueString); + } + + /** + * parse double type parameter + */ + public static double parseDoubleParameter(final Map parameters, final String fieldName, final double defaultValue) { + if (!parameters.containsKey(fieldName)) { + // all chunking algorithm parameters are optional + return defaultValue; + } + String fieldValueString = parameters.get(fieldName).toString(); + return NumberUtils.createDouble(fieldValueString); + } +} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index 66fd072d1..4bd4a38f9 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -11,85 +11,78 @@ import java.util.Locale; /** - * Validate and parse the parameter for text chunking processor and algorithms + * Validate the parameter for text chunking processor and algorithms */ public class ChunkerParameterValidator { /** - * Validate and parse the parameter for string parameters + * Validate string type parameter */ - public static String validateStringParameters( - final Map parameters, - final String fieldName, - final String defaultValue, - final boolean allowEmpty - ) { + public static void validateStringParameter(final Map parameters, final String fieldName, final boolean allowEmpty) { if (!parameters.containsKey(fieldName)) { // all chunking algorithm parameters are optional - return defaultValue; + return; } Object fieldValue = parameters.get(fieldName); if (!(fieldValue instanceof String)) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, String.class.getName()) ); - } else if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { + } + if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName)); } - return (String) fieldValue; } /** - * Validate and parse the parameter for numeric parameters + * Validate integer type parameter with positive value */ - public static Number validateNumberParameter(final Map parameters, final String fieldName, final Number defaultValue) { + public static void validatePositiveIntegerParameter( + final Map parameters, + final String fieldName, + final int defaultValue + ) { if (!parameters.containsKey(fieldName)) { // all chunking algorithm parameters are optional - return defaultValue; + return; } - String fieldValue = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValue))) { + String fieldValueString = parameters.get(fieldName).toString(); + if 
(!(NumberUtils.isParsable(fieldValueString))) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) ); } - return NumberUtils.createNumber(fieldValue); - } - - /** - * Validate and parse the parameter for positive integer parameters - */ - public static int validatePositiveIntegerParameter( - final Map parameters, - final String fieldName, - final int defaultValue - ) { - Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); - int fieldValueInt = fieldValueNumber.intValue(); - // sometimes parameter has negative default value, indicating that this parameter is not effective + int fieldValueInt = NumberUtils.createInteger(fieldValueString); + // sometimes the parameter has negative default value, indicating that this parameter is not effective if (fieldValueInt != defaultValue && fieldValueInt <= 0) { throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName)); } - return fieldValueInt; } /** - * Validate and parse the parameter for double parameters within [lowerBound, upperBound] + * Validate double type parameter within range [lowerBound, upperBound] */ - public static double validateRangeDoubleParameter( + public static void validateDoubleParameterWithinRange( final Map parameters, final String fieldName, final double lowerBound, - final double upperBound, - final double defaultValue + final double upperBound ) { - Number fieldValueNumber = validateNumberParameter(parameters, fieldName, defaultValue); - double fieldValueDouble = fieldValueNumber.doubleValue(); + if (!parameters.containsKey(fieldName)) { + // all chunking algorithm parameters are optional + return; + } + String fieldValueString = parameters.get(fieldName).toString(); + if (!(NumberUtils.isParsable(fieldValueString))) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) + ); + } + double fieldValueDouble = NumberUtils.createDouble(fieldValueString); if (fieldValueDouble < lowerBound || fieldValueDouble > upperBound) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", fieldName, lowerBound, upperBound) ); } - return fieldValueDouble; } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 4eef512e6..168936a8d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -8,7 +8,8 @@ import java.util.List; import java.util.ArrayList; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameters; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter; /** * The implementation {@link Chunker} for delimiter algorithm @@ -22,19 +23,32 @@ public class DelimiterChunker implements Chunker { private String delimiter; public DelimiterChunker(final Map parameters) { - validateAndParseParameters(parameters); + validateParameters(parameters); + parseParameters(parameters); } /** - * Validate and parse the parameters for delimiter algorithm, + * Validate the 
parameters for delimiter algorithm, * will throw IllegalArgumentException if delimiter is not a string or empty * * @param parameters a map containing parameters, containing the following parameters * 1. A string as the paragraph split indicator */ @Override - public void validateAndParseParameters(final Map parameters) { - this.delimiter = validateStringParameters(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER, false); + public void validateParameters(Map parameters) { + validateStringParameter(parameters, DELIMITER_FIELD, false); + } + + /** + * Parse the parameters for delimiter algorithm, + * will throw IllegalArgumentException if delimiter is not a string or empty + * + * @param parameters a map containing parameters, containing the following parameters + * 1. A string as the paragraph split indicator + */ + @Override + public void parseParameters(Map parameters) { + this.delimiter = parseStringParameter(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER); } /** diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index ef38d13cf..39da2fcd6 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -15,9 +15,12 @@ import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateRangeDoubleParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameters; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateDoubleParameterWithinRange; /** * The implementation {@link Chunker} for fixed token length algorithm. @@ -59,41 +62,36 @@ public class FixedTokenLengthChunker implements Chunker { private final AnalysisRegistry analysisRegistry; public FixedTokenLengthChunker(final Map parameters) { - validateAndParseParameters(parameters); + validateParameters(parameters); + parseParameters(parameters); this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD); } /** - * Validate and parse the parameters for fixed token length algorithm, + * Validate the parameters for fixed token length algorithm, * will throw IllegalArgumentException when parameters are invalid * - * @param parameters a map containing parameters, containing the following parameters: + * @param parameters a map containing non-runtime parameters as the following: * 1. tokenizer: the analyzer tokenizer in opensearch * 2. token_limit: the token limit for each chunked passage * 3. 
overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
- * 4. max_token_count: the max token limit for the tokenizer
 * Here are requirements for parameters:
 * max_token_count and token_limit should be a positive integer
 * overlap_rate should be within range [0, 0.5]
 * tokenizer should be string
 */
 @Override
- public void validateAndParseParameters(final Map parameters) {
- this.tokenLimit = validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
- this.overlapRate = validateRangeDoubleParameter(
- parameters,
- OVERLAP_RATE_FIELD,
- OVERLAP_RATE_LOWER_BOUND,
- OVERLAP_RATE_UPPER_BOUND,
- DEFAULT_OVERLAP_RATE
- );
- this.tokenizer = validateStringParameters(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER, false);
- if (!WORD_TOKENIZERS.contains(this.tokenizer)) {
+ public void validateParameters(Map parameters) {
+ validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
+ validateDoubleParameterWithinRange(parameters, OVERLAP_RATE_FIELD, OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND);
+ validateStringParameter(parameters, TOKENIZER_FIELD, false);
+ String tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
+ if (!WORD_TOKENIZERS.contains(tokenizer)) {
 throw new IllegalArgumentException(
 String.format(
 Locale.ROOT,
 "tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s",
- this.tokenizer,
+ tokenizer,
 ALGORITHM_NAME,
 WORD_TOKENIZERS
 )
@@ -101,6 +99,22 @@ public void validateAndParseParameters(final Map parameters) {
 }
 }
+ /**
+ * Parse the parameters for fixed token length algorithm,
+ * will throw IllegalArgumentException when parameters are invalid
+ *
+ * @param parameters a map containing non-runtime parameters as the following:
+ * 1. tokenizer: the analyzer tokenizer in opensearch
+ * 2. token_limit: the token limit for each chunked passage
+ * 3. 
overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
+ */
+ @Override
+ public void parseParameters(Map parameters) {
+ this.tokenLimit = parseIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
+ this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
+ this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
+ }
+
 /**
 * Return the chunked passages for fixed token length algorithm
 *
@@ -111,7 +125,8 @@ public void validateAndParseParameters(final Map parameters) {
 @Override
 public List chunk(final String content, final Map runtimeParameters) {
 // before chunking, validate and parse runtimeParameters
- int maxTokenCount = validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
+ validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
+ int maxTokenCount = parseIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
 List tokens = tokenize(content, tokenizer, maxTokenCount);
 List chunkResult = new ArrayList<>();
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
index 6ab9c3518..17de0afae 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
@@ -63,7 +63,7 @@ public Map> getTokeniz
 }
 public void testValidateAndParseParameters_whenNoParams_thenSuccessful() {
- fixedTokenLengthChunker.validateAndParseParameters(Map.of());
+ fixedTokenLengthChunker.validateParameters(Map.of());
 }
 public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() {
@@ -71,7 +71,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail()
 parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit");
 IllegalArgumentException illegalArgumentException = assertThrows(
 IllegalArgumentException.class,
- () -> fixedTokenLengthChunker.validateAndParseParameters(parameters)
+ () -> fixedTokenLengthChunker.validateParameters(parameters)
 );
 assertEquals(
 String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", TOKEN_LIMIT_FIELD, Number.class.getName()),
 illegalArgumentException.getMessage()
 );
@@ -84,7 +84,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail()
 parameters.put(TOKEN_LIMIT_FIELD, -1);
 IllegalArgumentException illegalArgumentException = assertThrows(
 IllegalArgumentException.class,
- () -> fixedTokenLengthChunker.validateAndParseParameters(parameters)
+ () -> fixedTokenLengthChunker.validateParameters(parameters)
 );
 assertEquals(
 String.format(Locale.ROOT, "Parameter [%s] must be positive.", TOKEN_LIMIT_FIELD),
 illegalArgumentException.getMessage()
 );
@@ -97,7 +97,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail()
 parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate");
 IllegalArgumentException illegalArgumentException = assertThrows(
 IllegalArgumentException.class,
- () -> fixedTokenLengthChunker.validateAndParseParameters(parameters)
+ () -> fixedTokenLengthChunker.validateParameters(parameters)
 );
 assertEquals(
 String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", OVERLAP_RATE_FIELD, Number.class.getName()),
 illegalArgumentException.getMessage()
 );
@@ -110,7 +110,7 @@ public void 
testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail( parameters.put(OVERLAP_RATE_FIELD, 0.6); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) + () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, 0.0, 0.5), @@ -123,7 +123,7 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) + () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", TOKENIZER_FIELD, String.class.getName()), @@ -133,11 +133,10 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { public void testValidateAndParseParameters_whenUnsupportedTokenizer_thenFail() { String ngramTokenizer = "ngram"; - Map parameters = new HashMap<>(); - parameters.put(TOKENIZER_FIELD, "ngram"); + Map parameters = Map.of(TOKENIZER_FIELD, ngramTokenizer); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateAndParseParameters(parameters) + () -> fixedTokenLengthChunker.validateParameters(parameters) ); assert (illegalArgumentException.getMessage() .contains(String.format(Locale.ROOT, "tokenizer [%s] is not supported for [%s] algorithm.", ngramTokenizer, ALGORITHM_NAME))); From 34348b3be8a91b99ac5c1c29051fabd5a7acc624 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 18:09:41 +0800 Subject: [PATCH 155/189] update comment Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 15 +++++++-------- .../chunker/FixedTokenLengthChunker.java | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index b2fe4d7d7..7b45d6d45 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -24,9 +24,9 @@ import org.opensearch.index.IndexSettings; import org.opensearch.ingest.AbstractProcessor; import org.opensearch.ingest.IngestDocument; -import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.Chunker; import org.opensearch.index.mapper.IndexFieldMapper; +import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; @@ -155,19 +155,18 @@ private int getMaxTokenCount(final Map sourceAndMetadataMap) { */ @Override public IngestDocument execute(IngestDocument ingestDocument) { - validateFieldsValue(ingestDocument); + Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); + validateFieldsValue(sourceAndMetadataMap); + // fixed token length algorithm needs runtime parameter max_token_count 
for tokenization
 int chunkCount = 0;
 Map runtimeParameters = new HashMap<>();
- Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
- // fixed token length algorithm needs max_token_count for tokenization
 int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap);
 runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
 chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, chunkCount);
 return ingestDocument;
 }
- private void validateFieldsValue(final IngestDocument ingestDocument) {
- Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+ private void validateFieldsValue(final Map sourceAndMetadataMap) {
 for (Map.Entry embeddingFieldsEntry : fieldMap.entrySet()) {
 Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey());
 if (sourceValue != null) {
@@ -297,8 +296,8 @@ private int chunkList(
 @SuppressWarnings("unchecked")
 private int chunkLeafType(final Object value, List result, final Map runTimeParameters, int chunkCount) {
- // leaf type means either String or List
- // the result should be an empty list
+ // leaf type means null, String or List
+ // the result should be an empty list when the input is null
 if (value instanceof String) {
 chunkCount = chunkString(value.toString(), result, runTimeParameters, chunkCount);
 } else if (isListOfString(value)) {
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
index 39da2fcd6..145484b79 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
@@ -104,7 +104,7 @@ public void validateParameters(Map parameters) {
 * will throw IllegalArgumentException when parameters are invalid
 *
 * @param parameters a map containing non-runtime parameters as the following:
- * 1. tokenizer: the analyzer tokenizer in opensearch
+ * 1. tokenizer: the word tokenizer in opensearch
 * 2. token_limit: the token limit for each chunked passage
 * 3. 
overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
 */
From 4153988e6fc311f4d5902d3d6a028c1a8bfdf7fc Mon Sep 17 00:00:00 2001
From: yuye-aws 
Date: Thu, 14 Mar 2024 18:21:02 +0800
Subject: [PATCH 156/189] provide fixed token length as the default algorithm

Signed-off-by: yuye-aws 
---
 .../processor/TextChunkingProcessor.java      | 43 ++++++++++---------
 .../processor/TextChunkingProcessorTests.java | 42 ++++++++++--------
 2 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index 7b45d6d45..22c152b36 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -40,15 +40,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {

 public static final String TYPE = "text_chunking";
-
 public static final String FIELD_MAP_FIELD = "field_map";
-
 public static final String ALGORITHM_FIELD = "algorithm";
-
 @VisibleForTesting
 static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";

 private static final int DEFAULT_MAX_CHUNK_LIMIT = -1;
+ private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;

 private int maxChunkLimit;

@@ -88,29 +86,32 @@ public String getType() {

 @SuppressWarnings("unchecked")
 private void validateAndParseAlgorithmMap(final Map algorithmMap) {
- if (algorithmMap.isEmpty()) {
- throw new IllegalArgumentException(
- String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD)
- );
- } else if (algorithmMap.size() > 1) {
+ if (algorithmMap.size() > 1) {
 throw new IllegalArgumentException(
 String.format(Locale.ROOT, "Unable to create %s processor as [%s] contains multiple algorithms", TYPE, ALGORITHM_FIELD)
 );
 }

- Entry algorithmEntry = algorithmMap.entrySet().iterator().next();
- String algorithmKey = algorithmEntry.getKey();
- Object algorithmValue = algorithmEntry.getValue();
- if (!(algorithmValue instanceof Map)) {
- throw new IllegalArgumentException(
- String.format(
- Locale.ROOT,
- "Unable to create %s processor as [%s] parameters cannot be cast to [%s]",
- TYPE,
- algorithmKey,
- Map.class.getName()
- )
- );
+ String algorithmKey;
+ Object algorithmValue;
+ if (algorithmMap.isEmpty()) {
+ algorithmKey = DEFAULT_ALGORITHM;
+ algorithmValue = new HashMap<>();
+ } else {
+ Entry algorithmEntry = algorithmMap.entrySet().iterator().next();
+ algorithmKey = algorithmEntry.getKey();
+ algorithmValue = algorithmEntry.getValue();
+ if (!(algorithmValue instanceof Map)) {
+ throw new IllegalArgumentException(
+ String.format(
+ Locale.ROOT,
+ "Unable to create %s processor as [%s] parameters cannot be cast to [%s]",
+ TYPE,
+ algorithmKey,
+ Map.class.getName()
+ )
+ );
+ }
 }

 Map chunkerParameters = (Map) algorithmValue;
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index 2ea5f314a..4df563150 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -125,6 +125,16 @@ private Map createNestedFieldMap() {
 return fieldMap;
 }

+ @SneakyThrows
+ private 
TextChunkingProcessor createDefaultAlgorithmInstance(Map fieldMap) { + Map config = new HashMap<>(); + Map algorithmMap = new HashMap<>(); + config.put(FIELD_MAP_FIELD, fieldMap); + config.put(ALGORITHM_FIELD, algorithmMap); + Map registry = new HashMap<>(); + return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + @SneakyThrows private TextChunkingProcessor createFixedTokenLengthInstance(Map fieldMap) { Map config = new HashMap<>(); @@ -195,24 +205,6 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { ); } - public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() { - Map config = new HashMap<>(); - Map fieldMap = new HashMap<>(); - Map algorithmMap = new HashMap<>(); - fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - config.put(TextChunkingProcessor.FIELD_MAP_FIELD, fieldMap); - config.put(ALGORITHM_FIELD, algorithmMap); - Map registry = new HashMap<>(); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config) - ); - assertEquals( - String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD), - illegalArgumentException.getMessage() - ); - } - public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); @@ -403,7 +395,21 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumE ), illegalArgumentException.getMessage() ); + } + @SneakyThrows + public void testCreate_withDefaultAlgorithm_andSourceDataString_thenSucceed() { + TextChunkingProcessor processor = createDefaultAlgorithmInstance(createStringFieldMap()); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add( + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
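+ // with an empty algorithm map the processor falls back to fixed token length, so this 24-token passage stays in a single chunk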
+ ); + assertEquals(expectedPassages, passages); } @SneakyThrows From 06ca1c78f6d0578e8e6094a09c32ca2ab5ea17ff Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 18:32:02 +0800 Subject: [PATCH 157/189] adjust exception message Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 5 ++--- .../neuralsearch/processor/TextChunkingProcessorTests.java | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 22c152b36..026cd09d8 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -105,10 +105,9 @@ private void validateAndParseAlgorithmMap(final Map algorithmMap throw new IllegalArgumentException( String.format( Locale.ROOT, - "Unable to create %s processor as [%s] parameters cannot be cast to [%s]", + "Unable to create %s processor as parameters for [%s] algorithm must be an object", TYPE, - algorithmKey, - Map.class.getName() + algorithmKey ) ); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 4df563150..c2b1894eb 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -259,10 +259,9 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { assertEquals( String.format( Locale.ROOT, - "Unable to create %s processor as [%s] parameters cannot be cast to [%s]", + "Unable to create %s processor as parameters for [%s] algorithm must be an object", TYPE, - FixedTokenLengthChunker.ALGORITHM_NAME, - Map.class.getName() + FixedTokenLengthChunker.ALGORITHM_NAME ), illegalArgumentException.getMessage() ); From 3cf671dbd273b7b36de638c70d224ca4a14349c8 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Thu, 14 Mar 2024 18:34:35 +0800 Subject: [PATCH 158/189] adjust exception message Signed-off-by: yuye-aws --- .../processor/chunker/ChunkerParameterValidator.java | 6 +++--- .../processor/chunker/DelimiterChunkerTests.java | 2 +- .../processor/chunker/FixedTokenLengthChunkerTests.java | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java index 4bd4a38f9..abf37de0f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java @@ -26,7 +26,7 @@ public static void validateStringParameter(final Map parameters, Object fieldValue = parameters.get(fieldName); if (!(fieldValue instanceof String)) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, String.class.getName()) + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, String.class.getName()) ); } if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { @@ -49,7 +49,7 @@ public static void validatePositiveIntegerParameter( String fieldValueString = parameters.get(fieldName).toString(); if 
(!(NumberUtils.isParsable(fieldValueString))) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Number.class.getName()) ); } int fieldValueInt = NumberUtils.createInteger(fieldValueString); @@ -75,7 +75,7 @@ public static void validateDoubleParameterWithinRange( String fieldValueString = parameters.get(fieldName).toString(); if (!(NumberUtils.isParsable(fieldValueString))) { throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", fieldName, Number.class.getName()) + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Number.class.getName()) ); } double fieldValueDouble = NumberUtils.createDouble(fieldValueString); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index c1d489d46..7746a611e 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -21,7 +21,7 @@ public void testChunkerWithDelimiterFieldNotString() { () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, List.of(""))) ); Assert.assertEquals( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", DELIMITER_FIELD, String.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", DELIMITER_FIELD, String.class.getName()), exception.getMessage() ); } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 17de0afae..7bba1e73b 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -74,7 +74,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", TOKEN_LIMIT_FIELD, Number.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKEN_LIMIT_FIELD, Number.class.getName()), illegalArgumentException.getMessage() ); } @@ -100,7 +100,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", OVERLAP_RATE_FIELD, Number.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", OVERLAP_RATE_FIELD, Number.class.getName()), illegalArgumentException.getMessage() ); } @@ -126,7 +126,7 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { () -> fixedTokenLengthChunker.validateParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] cannot be cast to [%s]", TOKENIZER_FIELD, String.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKENIZER_FIELD, String.class.getName()), illegalArgumentException.getMessage() ); } From 5fe5eefa400ef9433d6f0f5a5d4c9afd3a691adf Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 
Mar 2024 10:25:03 +0800 Subject: [PATCH 159/189] use object nonnull and require nonnull Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 4 ++-- .../neuralsearch/processor/TextChunkingProcessorIT.java | 7 ++++--- .../neuralsearch/processor/TextChunkingProcessorTests.java | 4 +++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 026cd09d8..5d46b048a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -139,7 +139,7 @@ private int getMaxTokenCount(final Map sourceAndMetadataMap) { String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); int maxTokenCount; - if (indexMetadata != null) { + if (Objects.nonNull(indexMetadata)) { // if the index is specified in the metadata, read maxTokenCount from the index setting IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); @@ -169,7 +169,7 @@ public IngestDocument execute(IngestDocument ingestDocument) { private void validateFieldsValue(final Map sourceAndMetadataMap) { for (Map.Entry embeddingFieldsEntry : fieldMap.entrySet()) { Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey()); - if (sourceValue != null) { + if (Objects.nonNull(sourceValue)) { String sourceKey = embeddingFieldsEntry.getKey(); if (sourceValue instanceof List || sourceValue instanceof Map) { validateNestedTypeValue(sourceKey, sourceValue, 1); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java index dfcf4b2cb..dd517aa17 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorIT.java @@ -17,6 +17,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import org.opensearch.client.Response; import org.opensearch.common.xcontent.XContentHelper; @@ -194,7 +195,7 @@ private void validateIndexIngestResults(String indexName, String fieldName, Obje private void createPipelineProcessor(String pipelineName) throws Exception { URL pipelineURLPath = classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName)); - assert pipelineURLPath != null; + Objects.requireNonNull(pipelineURLPath); String requestBody = Files.readString(Path.of(pipelineURLPath.toURI())); Response pipelineCreateResponse = makeRequest( client(), @@ -214,13 +215,13 @@ private void createPipelineProcessor(String pipelineName) throws Exception { private void createTextChunkingIndex(String indexName, String pipelineName) throws Exception { URL indexSettingsURLPath = classLoader.getResource("processor/chunker/TextChunkingIndexSettings.json"); - assert indexSettingsURLPath != null; + Objects.requireNonNull(indexSettingsURLPath); createIndexWithConfiguration(indexName, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName); } private void ingestDocument(String documentPath) throws Exception { URL documentURLPath = classLoader.getResource(documentPath); - assert 
documentURLPath != null; + Objects.requireNonNull(documentURLPath); String ingestDocument = Files.readString(Path.of(documentURLPath.toURI())); Response response = makeRequest( client(), diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index c2b1894eb..239537a52 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -12,6 +12,8 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; + import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; import static org.mockito.ArgumentMatchers.anyString; @@ -325,7 +327,7 @@ private Map createMaxDepthLimitExceedMap(int maxDepth) { } Map resultMap = new HashMap<>(); Map innerMap = createMaxDepthLimitExceedMap(maxDepth + 1); - if (innerMap != null) { + if (Objects.nonNull(innerMap)) { resultMap.put(INPUT_FIELD, innerMap); } return resultMap; From f3decb4ef12d7796d7e4a3549513b7cafcd9c438 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 11:30:32 +0800 Subject: [PATCH 160/189] apply final to ingest document and chunk count Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 57 ++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 5d46b048a..e03c4780f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -77,7 +77,7 @@ public TextChunkingProcessor( this.clusterService = clusterService; this.indicesService = indicesService; this.analysisRegistry = analysisRegistry; - validateAndParseAlgorithmMap(algorithmMap); + parseAlgorithmMap(algorithmMap); } public String getType() { @@ -85,7 +85,7 @@ public String getType() { } @SuppressWarnings("unchecked") - private void validateAndParseAlgorithmMap(final Map algorithmMap) { + private void parseAlgorithmMap(final Map algorithmMap) { if (algorithmMap.size() > 1) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Unable to create %s processor as [%s] contains multiple algorithms", TYPE, ALGORITHM_FIELD) @@ -154,15 +154,16 @@ private int getMaxTokenCount(final Map sourceAndMetadataMap) { * @param ingestDocument {@link IngestDocument} which is the document passed to processor. 
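 * @return the ingest document with chunked passages written to the output fields configured in field_map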
*/ @Override - public IngestDocument execute(IngestDocument ingestDocument) { + public IngestDocument execute(final IngestDocument ingestDocument) { Map sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); validateFieldsValue(sourceAndMetadataMap); // fixed token length algorithm needs runtime parameter max_token_count for tokenization - int chunkCount = 0; Map runtimeParameters = new HashMap<>(); - int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); - runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); - chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, chunkCount); + if (chunker instanceof FixedTokenLengthChunker) { + int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); + runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); + } + chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, 0); return ingestDocument; } @@ -224,8 +225,9 @@ private int chunkMapType( Map sourceAndMetadataMap, final Map fieldMap, final Map runtimeParameters, - int chunkCount + final int chunkCount ) { + int updatedChunkCount = chunkCount; for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { String originalKey = fieldMapEntry.getKey(); Object targetKey = fieldMapEntry.getValue(); @@ -236,73 +238,76 @@ private int chunkMapType( List sourceObjectList = (List) sourceObject; for (Object source : sourceObjectList) { if (source instanceof Map) { - chunkCount = chunkMapType( + updatedChunkCount = chunkMapType( (Map) source, (Map) targetKey, runtimeParameters, - chunkCount + updatedChunkCount ); } } } else if (sourceObject instanceof Map) { - chunkCount = chunkMapType( + updatedChunkCount = chunkMapType( (Map) sourceObject, (Map) targetKey, runtimeParameters, - chunkCount + updatedChunkCount ); } } else { // chunk the object when target key is a string Object chunkObject = sourceAndMetadataMap.get(originalKey); List chunkedResult = new ArrayList<>(); - chunkCount = chunkLeafType(chunkObject, chunkedResult, runtimeParameters, chunkCount); + updatedChunkCount = chunkLeafType(chunkObject, chunkedResult, runtimeParameters, updatedChunkCount); sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); } } - return chunkCount; + return updatedChunkCount; } - private int chunkString(final String content, List result, final Map runTimeParameters, int chunkCount) { + private int chunkString(final String content, List result, final Map runTimeParameters, final int chunkCount) { // chunk the content, return the updated chunkCount and add chunk passages into result + int updatedChunkCount = chunkCount; List contentResult = chunker.chunk(content, runTimeParameters); - chunkCount += contentResult.size(); - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && chunkCount > maxChunkLimit) { + updatedChunkCount += contentResult.size(); + if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && updatedChunkCount > maxChunkLimit) { throw new IllegalArgumentException( String.format( Locale.ROOT, "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]", - chunkCount, + updatedChunkCount, maxChunkLimit ) ); } result.addAll(contentResult); - return chunkCount; + return updatedChunkCount; } private int chunkList( final List contentList, List result, final Map runTimeParameters, - int chunkCount + final int chunkCount ) { // flatten original output format from List> to List + int updatedChunkCount = chunkCount; for (String content : contentList) { - chunkCount = chunkString(content, result, 
runTimeParameters, chunkCount);
+ updatedChunkCount = chunkString(content, result, runTimeParameters, updatedChunkCount);
 }
- return chunkCount;
+ return updatedChunkCount;
 }

 @SuppressWarnings("unchecked")
- private int chunkLeafType(final Object value, List result, final Map runTimeParameters, int chunkCount) {
+ private int chunkLeafType(final Object value, List result, final Map runTimeParameters, final int chunkCount) {
 // leaf type means null, String or List
 // the result should be an empty list when the input is null
+ int updatedChunkCount = chunkCount;
 if (value instanceof String) {
- chunkCount = chunkString(value.toString(), result, runTimeParameters, chunkCount);
+ updatedChunkCount = chunkString(value.toString(), result, runTimeParameters, updatedChunkCount);
 } else if (isListOfString(value)) {
- chunkCount = chunkList((List) value, result, runTimeParameters, chunkCount);
+ updatedChunkCount = chunkList((List) value, result, runTimeParameters, updatedChunkCount);
 }
- return chunkCount;
+ return updatedChunkCount;
 }
}

From 3b8a3af2fbaf36f35a53a1913ad9cfa22909d452 Mon Sep 17 00:00:00 2001
From: yuye-aws 
Date: Fri, 15 Mar 2024 12:00:30 +0800
Subject: [PATCH 161/189] merge parameter validator into the parser

Signed-off-by: yuye-aws 
---
 .../processor/TextChunkingProcessor.java      | 11 ++-
 .../processor/chunker/Chunker.java            |  9 +-
 .../chunker/ChunkerParameterParser.java       | 51 +++++++++--
 .../chunker/ChunkerParameterValidator.java    | 88 -------------------
 .../processor/chunker/DelimiterChunker.java   | 20 +----
 .../chunker/FixedTokenLengthChunker.java      | 68 +++++++-------
 .../chunker/FixedTokenLengthChunkerTests.java | 20 ++---
 7 files changed, 91 insertions(+), 176 deletions(-)
 delete mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index e03c4780f..5e648152a 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -28,13 +28,13 @@
 import org.opensearch.index.mapper.IndexFieldMapper;
 import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
+import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;

 /**
 * This processor is used for user input data text chunking.
- * The chunking results could be fed to downstream embedding processor,
- * algorithm defines chunking algorithm and parameters,
+ * The chunking results could be fed to a downstream embedding processor.
+ * The processor needs two fields: algorithm and field_map,
+ * where algorithm defines chunking algorithm and parameters,
 * and field_map specifies which fields need chunking and the corresponding keys for the chunking results.
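 * For illustration only (index and field names here are hypothetical), a pipeline may configure this processor as:
 * { "text_chunking": { "algorithm": { "fixed_token_length": { "token_limit": 10 } }, "field_map": { "body": "body_chunk" } } }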
*/ public final class TextChunkingProcessor extends AbstractProcessor { @@ -117,8 +117,7 @@ private void parseAlgorithmMap(final Map algorithmMap) { // fixed token length algorithm needs analysis registry for tokenization chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); - validatePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); - this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); + this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); } @SuppressWarnings("unchecked") diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index af8290f7e..da67c91d0 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -13,16 +13,9 @@ */ public interface Chunker { - /** - * Validate the parameters for chunking algorithm, - * will throw IllegalArgumentException when parameters are invalid - * - * @param parameters a map containing non-runtime parameters for chunking algorithms - */ - void validateParameters(Map parameters); - /** * Parse the parameters for chunking algorithm. + * Throw IllegalArgumentException when parameters are invalid. * The parameters must be validated before parsing. * * @param parameters a map containing non-runtime parameters for chunking algorithms diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java index 5fb5d6666..d9a0e75ba 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -4,47 +4,80 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; +import java.util.Locale; import java.util.Map; /** * Parse the parameter for text chunking processor and algorithms. - * The parameter must be validated before parsing. + * Throw IllegalArgumentException when parameters are invalid. */ public class ChunkerParameterParser { /** - * Parse string type parameter + * Parse string type parameter. + * Throw IllegalArgumentException if parameter is not a string or empty. */ public static String parseStringParameter(final Map parameters, final String fieldName, final String defaultValue) { if (!parameters.containsKey(fieldName)) { + // all string parameters are optional return defaultValue; } - return parameters.get(fieldName).toString(); + Object fieldValue = parameters.get(fieldName); + if (!(fieldValue instanceof String)) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, String.class.getName()) + ); + } + if (StringUtils.isEmpty(fieldValue.toString())) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName)); + } + return fieldValue.toString(); } /** - * Parse integer type parameter + * Parse Integer type parameter with positive value. 
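+ * For example, parsePositiveIntegerParameter(Map.of("max_chunk_limit", "5"), "max_chunk_limit", -1) returns 5 (illustrative values).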
+ * Throw IllegalArgumentException if parameter is not a positive integer. */ - public static int parseIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { + public static int parsePositiveIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { if (!parameters.containsKey(fieldName)) { // all chunking algorithm parameters are optional return defaultValue; } + int fieldValueInt; String fieldValueString = parameters.get(fieldName).toString(); - return NumberUtils.createInteger(fieldValueString); + try { + fieldValueInt = NumberUtils.createInteger(fieldValueString); + } catch (Exception e) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName()) + ); + } + // some parameter has negative default value, indicating that this parameter is not effective + if (fieldValueInt != defaultValue && fieldValueInt <= 0) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName)); + } + return fieldValueInt; } /** - * parse double type parameter + * Parse double type parameter. + * Throw IllegalArgumentException if parameter is not a double. */ public static double parseDoubleParameter(final Map parameters, final String fieldName, final double defaultValue) { if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional + // all double parameters are optional return defaultValue; } String fieldValueString = parameters.get(fieldName).toString(); - return NumberUtils.createDouble(fieldValueString); + try { + return NumberUtils.createDouble(fieldValueString); + } catch (Exception e) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Double.class.getName()) + ); + } } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java deleted file mode 100644 index abf37de0f..000000000 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterValidator.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ -package org.opensearch.neuralsearch.processor.chunker; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.math.NumberUtils; - -import java.util.Map; -import java.util.Locale; - -/** - * Validate the parameter for text chunking processor and algorithms - */ -public class ChunkerParameterValidator { - - /** - * Validate string type parameter - */ - public static void validateStringParameter(final Map parameters, final String fieldName, final boolean allowEmpty) { - if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional - return; - } - Object fieldValue = parameters.get(fieldName); - if (!(fieldValue instanceof String)) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, String.class.getName()) - ); - } - if (!allowEmpty && StringUtils.isEmpty(fieldValue.toString())) { - throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", fieldName)); - } - } - - /** - * Validate integer type parameter with positive value - */ - public static void validatePositiveIntegerParameter( - final Map parameters, - final String fieldName, 
- final int defaultValue - ) { - if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional - return; - } - String fieldValueString = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValueString))) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Number.class.getName()) - ); - } - int fieldValueInt = NumberUtils.createInteger(fieldValueString); - // sometimes the parameter has negative default value, indicating that this parameter is not effective - if (fieldValueInt != defaultValue && fieldValueInt <= 0) { - throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName)); - } - } - - /** - * Validate double type parameter within range [lowerBound, upperBound] - */ - public static void validateDoubleParameterWithinRange( - final Map parameters, - final String fieldName, - final double lowerBound, - final double upperBound - ) { - if (!parameters.containsKey(fieldName)) { - // all chunking algorithm parameters are optional - return; - } - String fieldValueString = parameters.get(fieldName).toString(); - if (!(NumberUtils.isParsable(fieldValueString))) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Number.class.getName()) - ); - } - double fieldValueDouble = NumberUtils.createDouble(fieldValueString); - if (fieldValueDouble < lowerBound || fieldValueDouble > upperBound) { - throw new IllegalArgumentException( - String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", fieldName, lowerBound, upperBound) - ); - } - } -} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 168936a8d..0008fad21 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -9,7 +9,6 @@ import java.util.ArrayList; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; -import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter; /** * The implementation {@link Chunker} for delimiter algorithm @@ -23,28 +22,15 @@ public class DelimiterChunker implements Chunker { private String delimiter; public DelimiterChunker(final Map parameters) { - validateParameters(parameters); parseParameters(parameters); } /** - * Validate the parameters for delimiter algorithm, - * will throw IllegalArgumentException if delimiter is not a string or empty + * Parse the parameters for delimiter algorithm. + * Throw IllegalArgumentException if delimiter is not a string or empty. * * @param parameters a map containing parameters, containing the following parameters - * 1. A string as the paragraph split indicator - */ - @Override - public void validateParameters(Map parameters) { - validateStringParameter(parameters, DELIMITER_FIELD, false); - } - - /** - * Parse the parameters for delimiter algorithm, - * will throw IllegalArgumentException if delimiter is not a string or empty - * - * @param parameters a map containing parameters, containing the following parameters - * 1. A string as the paragraph split indicator + * 1. 
delimiter: a string as the paragraph split indicator
 */
 @Override
 public void parseParameters(Map parameters) {
 this.delimiter = parseStringParameter(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER);
 }

 /**
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
index 145484b79..c21fcf12b 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
@@ -17,10 +17,7 @@
 import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateStringParameter;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validateDoubleParameterWithinRange;
+import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;

 /**
 * The implementation {@link Chunker} for fixed token length algorithm.
@@ -62,35 +59,44 @@ public class FixedTokenLengthChunker implements Chunker {
 private final AnalysisRegistry analysisRegistry;

 public FixedTokenLengthChunker(final Map parameters) {
- validateParameters(parameters);
 parseParameters(parameters);
 this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD);
 }

 /**
- * Validate the parameters for fixed token length algorithm,
- * will throw IllegalArgumentException when parameters are invalid
+ * Parse the parameters for fixed token length algorithm.
+ * Throw IllegalArgumentException when parameters are invalid.
 *
- * @param parameters a map containing non-runtime parameters as the following:
- * 1. tokenizer: the analyzer tokenizer in opensearch
+ * @param parameters a map containing non-runtime parameters as the following:
+ * 1. tokenizer: the word tokenizer in opensearch
 * 2. token_limit: the token limit for each chunked passage
 * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many tokens come from the previous passage
 * Here are requirements for parameters:
- * max_token_count and token_limit should be a positive integer
- * overlap_rate should be within range [0, 0.5]
- * tokenizer should be string
+ * 1. token_limit must be a positive integer
+ * 2. overlap_rate must be within range [0, 0.5]
+ * 3. 
tokenizer must be a word tokenizer */ @Override - public void validateParameters(Map parameters) { - validatePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); - validateDoubleParameterWithinRange(parameters, OVERLAP_RATE_FIELD, OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND); - validateStringParameter(parameters, TOKENIZER_FIELD, false); - String tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER); + public void parseParameters(Map parameters) { + this.tokenLimit = parsePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); + this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE); + this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER); + if (overlapRate < OVERLAP_RATE_LOWER_BOUND || overlapRate > OVERLAP_RATE_UPPER_BOUND) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Parameter [%s] must be between %s and %s", + OVERLAP_RATE_FIELD, + OVERLAP_RATE_LOWER_BOUND, + OVERLAP_RATE_UPPER_BOUND + ) + ); + } if (!WORD_TOKENIZERS.contains(tokenizer)) { throw new IllegalArgumentException( String.format( Locale.ROOT, - "tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s", + "Tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s", tokenizer, ALGORITHM_NAME, WORD_TOKENIZERS @@ -100,33 +106,19 @@ public void validateParameters(Map parameters) { } /** - * Parse the parameters for fixed token length algorithm, - * will throw IllegalArgumentException when parameters are invalid - * - * @param parameters a map non-runtime parameters as the following: - * 1. tokenizer: the word tokenizer in opensearch - * 2. token_limit: the token limit for each chunked passage - * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - */ - @Override - public void parseParameters(Map parameters) { - this.tokenLimit = parseIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); - this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE); - this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER); - } - - /** - * Return the chunked passages for fixed token length algorithm + * Return the chunked passages for fixed token length algorithm. + * Throw IllegalArgumentException when runtime parameters are invalid. * * @param content input string * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: * 1. max_token_count the max token limit for the tokenizer + * Here are requirements for runtime parameters: + * 1. 
max_token_count must be a positive integer */ @Override public List chunk(final String content, final Map runtimeParameters) { - // before chunking, validate and parse runtimeParameters - validatePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); - int maxTokenCount = parseIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); + // parse runtimeParameters before chunking + int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); List tokens = tokenize(content, tokenizer, maxTokenCount); List chunkResult = new ArrayList<>(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 7bba1e73b..979228366 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -63,7 +63,7 @@ public Map> getTokeniz } public void testValidateAndParseParameters_whenNoParams_thenSuccessful() { - fixedTokenLengthChunker.validateParameters(Map.of()); + fixedTokenLengthChunker.parseParameters(Map.of()); } public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() { @@ -71,10 +71,10 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKEN_LIMIT_FIELD, Number.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKEN_LIMIT_FIELD, Integer.class.getName()), illegalArgumentException.getMessage() ); } @@ -84,7 +84,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail() parameters.put(TOKEN_LIMIT_FIELD, -1); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] must be positive.", TOKEN_LIMIT_FIELD), @@ -97,10 +97,10 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( - String.format(Locale.ROOT, "Parameter [%s] must be of %s type", OVERLAP_RATE_FIELD, Number.class.getName()), + String.format(Locale.ROOT, "Parameter [%s] must be of %s type", OVERLAP_RATE_FIELD, Double.class.getName()), illegalArgumentException.getMessage() ); } @@ -110,7 +110,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail( parameters.put(OVERLAP_RATE_FIELD, 0.6); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> 
fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, 0.0, 0.5), @@ -123,7 +123,7 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assertEquals( String.format(Locale.ROOT, "Parameter [%s] must be of %s type", TOKENIZER_FIELD, String.class.getName()), @@ -136,10 +136,10 @@ public void testValidateAndParseParameters_whenUnsupportedTokenizer_thenFail() { Map parameters = Map.of(TOKENIZER_FIELD, ngramTokenizer); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, - () -> fixedTokenLengthChunker.validateParameters(parameters) + () -> fixedTokenLengthChunker.parseParameters(parameters) ); assert (illegalArgumentException.getMessage() - .contains(String.format(Locale.ROOT, "tokenizer [%s] is not supported for [%s] algorithm.", ngramTokenizer, ALGORITHM_NAME))); + .contains(String.format(Locale.ROOT, "Tokenizer [%s] is not supported for [%s] algorithm.", ngramTokenizer, ALGORITHM_NAME))); } public void testChunk_withTokenLimit_10() { From 89c465c8ca56c77ea1d5736fcd798ab6648894fe Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 12:15:52 +0800 Subject: [PATCH 162/189] assign positive default value for max chunk limit Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 10 ++++++---- .../processor/chunker/ChunkerParameterParser.java | 3 +-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 5e648152a..963135e68 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -45,7 +45,7 @@ public final class TextChunkingProcessor extends AbstractProcessor { @VisibleForTesting static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; - private static final int DEFAULT_MAX_CHUNK_LIMIT = -1; + private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME; private int maxChunkLimit; @@ -114,8 +114,10 @@ private void parseAlgorithmMap(final Map algorithmMap) { } Map chunkerParameters = (Map) algorithmValue; - // fixed token length algorithm needs analysis registry for tokenization - chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); + if (algorithmKey.equals(FixedTokenLengthChunker.ALGORITHM_NAME)) { + // fixed token length algorithm needs analysis registry for tokenization + chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); + } this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); } @@ -269,7 +271,7 @@ private int chunkString(final String content, List result, final Map contentResult = chunker.chunk(content, runTimeParameters); updatedChunkCount += contentResult.size(); - if (maxChunkLimit != DEFAULT_MAX_CHUNK_LIMIT && updatedChunkCount > 
maxChunkLimit) {
+ if (updatedChunkCount > maxChunkLimit) {
 throw new IllegalArgumentException(
 String.format(
 Locale.ROOT,
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
index d9a0e75ba..56916ea34 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
@@ -55,8 +55,7 @@ public static int parsePositiveIntegerParameter(final Map parame
 String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
 );
 }
- // some parameter has negative default value, indicating that this parameter is not effective
- if (fieldValueInt != defaultValue && fieldValueInt <= 0) {
+ if (fieldValueInt <= 0) {
 throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
 }
 return fieldValueInt;
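One consequence of this commit deserves a callout: because the default is now a genuine positive value (100 instead of -1), parsePositiveIntegerParameter no longer needs the "negative default means not effective" escape hatch, and every explicit non-positive input is rejected. A minimal, self-contained sketch of the resulting contract (it substitutes Integer.parseInt for the NumberUtils helper the real class uses):

import java.util.Locale;
import java.util.Map;

final class PositiveIntegerParameterSketch {
    private PositiveIntegerParameterSketch() {}

    // Post-change contract: absent -> positive default; present -> must parse as an integer and be > 0.
    static int parsePositiveIntegerParameter(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
        if (!parameters.containsKey(fieldName)) {
            return defaultValue; // all chunking parameters are optional; the default itself is positive now
        }
        final int fieldValueInt;
        try {
            fieldValueInt = Integer.parseInt(parameters.get(fieldName).toString());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
            );
        }
        if (fieldValueInt <= 0) {
            // no defaultValue comparison anymore: zero and negative values always fail
            throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
        }
        return fieldValueInt;
    }
}

Under this contract an explicit max_chunk_limit of -1 is rejected at pipeline creation, which is exactly the behavior the next commits revisit when -1 is reintroduced as a deliberate "disabled" sentinel.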
From e7dffe0f30df03cd0e48a7bfd6deae49ab89ef63 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 12:46:42 +0800
Subject: [PATCH 163/189] validate supported chunker algorithm in text chunking processor

Signed-off-by: yuye-aws

---
 .../processor/TextChunkingProcessor.java | 12 ++++++++++++
 .../processor/chunker/ChunkerFactory.java | 19 ++++++-----------
 .../chunker/ChunkerFactoryTests.java | 8 +-------
 3 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index 963135e68..c9bd5e46e 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -11,6 +11,7 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import java.util.Set;
 
 import com.google.common.annotations.VisibleForTesting;
 
@@ -113,6 +114,17 @@ private void parseAlgorithmMap(final Map algorithmMap) {
 }
 }
 
+ Set allChunkerAlgorithms = ChunkerFactory.allChunkerAlgorithms;
+ if (!allChunkerAlgorithms.contains(algorithmKey)) {
+ throw new IllegalArgumentException(
+ String.format(
+ Locale.ROOT,
+ "Chunking algorithm [%s] is not supported. Supported chunking algorithms are %s",
+ algorithmKey,
+ allChunkerAlgorithms
+ )
+ );
+ }
 Map chunkerParameters = (Map) algorithmValue;
 if (algorithmKey.equals(FixedTokenLengthChunker.ALGORITHM_NAME)) {
 // fixed token length algorithm needs analysis registry for tokenization
 chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
index d66bc423e..a3f7346d5 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
@@ -5,9 +5,11 @@
 package org.opensearch.neuralsearch.processor.chunker;
 
 import com.google.common.collect.ImmutableMap;
+import lombok.Getter;
 
 import java.util.Map;
-import java.util.Locale;
+import java.util.Objects;
+import java.util.Set;
 import java.util.function.Function;
 
 /**
@@ -22,18 +24,13 @@ public class ChunkerFactory {
 DelimiterChunker::new
 );
 
+ @Getter
+ public static Set allChunkerAlgorithms = chunkers.keySet();
+
 public static Chunker create(final String type, final Map parameters) {
 Function, Chunker> chunkerConstructionFunction = chunkers.get(type);
- if (chunkerConstructionFunction == null) {
- throw new IllegalArgumentException(
- String.format(
- Locale.ROOT,
- "Chunking algorithm [%s] is not supported. Supported chunking algorithms are %s",
- type,
- chunkers.keySet()
- )
- );
- }
+ // chunkerConstructionFunction is not null because we have validated the type in text chunking processor
+ Objects.requireNonNull(chunkerConstructionFunction);
 return chunkerConstructionFunction.apply(parameters);
 }
 }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java
index 2b06ca10a..21859c24e 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactoryTests.java
@@ -9,7 +9,6 @@
 import org.opensearch.test.OpenSearchTestCase;
 
 import java.util.HashMap;
-import java.util.Locale;
 import java.util.Map;
 
 import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD;
@@ -33,12 +32,7 @@ public void testCreate_Delimiter() {
 
 public void testCreate_Invalid() {
 String invalidChunkerName = "Invalid Chunker Algorithm";
- IllegalArgumentException illegalArgumentException = assertThrows(
- IllegalArgumentException.class,
- () -> ChunkerFactory.create(invalidChunkerName, createChunkParameters())
- );
- assert (illegalArgumentException.getMessage()
- .contains(String.format(Locale.ROOT, "Chunking algorithm [%s] is not supported.", invalidChunkerName)));
+ assertThrows(NullPointerException.class, () -> ChunkerFactory.create(invalidChunkerName, createChunkParameters()));
 }
 
 private Map createChunkParameters() {
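The division of labor introduced by this commit is easy to miss in the diff: the processor now rejects unknown algorithm names up front, so the factory may assume a valid key and treat a missing constructor as a programming error. A condensed sketch of the resulting factory shape (generics are spelled out here, whereas the patch uses raw types):

import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;

final class ChunkerFactorySketch {
    private ChunkerFactorySketch() {}

    // One constructor reference per supported algorithm.
    private static final Map<String, Function<Map<String, Object>, Chunker>> CHUNKERS = Map.of(
        FixedTokenLengthChunker.ALGORITHM_NAME, FixedTokenLengthChunker::new,
        DelimiterChunker.ALGORITHM_NAME, DelimiterChunker::new
    );

    // Exposed so TextChunkingProcessor can validate user input before calling create().
    static final Set<String> SUPPORTED_ALGORITHMS = CHUNKERS.keySet();

    static Chunker create(final String type, final Map<String, Object> parameters) {
        // By this point the processor has already validated type, so a null
        // constructor indicates a bug, not bad user input.
        return Objects.requireNonNull(CHUNKERS.get(type)).apply(parameters);
    }
}

The trade-off is visible in ChunkerFactoryTests above: the invalid-name case now surfaces as a NullPointerException from the factory, while the descriptive IllegalArgumentException listing the supported algorithms moves up into TextChunkingProcessor, where user input is actually handled.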
From 463de71bd0075af4524fe29b4140769193049d90 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 14:24:10 +0800
Subject: [PATCH 164/189] update parameter setting of max chunk limit

Signed-off-by: yuye-aws

---
 .../processor/TextChunkingProcessor.java | 27 ++++++---
 .../chunker/ChunkerParameterParser.java | 19 ++++--
 .../processor/TextChunkingProcessorTests.java | 60 +++++++++++++------
 3 files changed, 75 insertions(+), 31 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index c9bd5e46e..268248867 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -29,7 +29,8 @@
 import org.opensearch.index.mapper.IndexFieldMapper;
 import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
-import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter;
+
+import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
 
 /**
  * This processor is used for user input data text chunking.
@@ -47,6 +48,7 @@ public final class TextChunkingProcessor extends AbstractProcessor {
 static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";
 
 private static final int DEFAULT_MAX_CHUNK_LIMIT = 100;
+ private static final int DISABLED_MAX_CHUNK_LIMIT = -1;
 private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
 
 private int maxChunkLimit;
@@ -131,7 +133,17 @@ private void parseAlgorithmMap(final Map algorithmMap) {
 chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry);
 }
 this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters);
- this.maxChunkLimit = parsePositiveIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
+ this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT);
+ if (maxChunkLimit <= 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
+ throw new IllegalArgumentException(
+ String.format(
+ Locale.ROOT,
+ "Parameter [%s] must be positive or %s to disable this parameter",
+ MAX_CHUNK_LIMIT_FIELD,
+ DISABLED_MAX_CHUNK_LIMIT
+ )
+ );
+ }
 }
 
 @SuppressWarnings("unchecked")
@@ -283,13 +295,14 @@ private int chunkString(final String content, List result, final Map
 List contentResult = chunker.chunk(content, runTimeParameters);
 updatedChunkCount += contentResult.size();
- if (updatedChunkCount > maxChunkLimit) {
- throw new IllegalArgumentException(
+ if (updatedChunkCount > maxChunkLimit && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
+ throw new IllegalStateException(
 String.format(
 Locale.ROOT,
- "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]",
- updatedChunkCount,
- maxChunkLimit
+ "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
+ TYPE,
+ maxChunkLimit,
+ MAX_CHUNK_LIMIT_FIELD
 )
 );
 }
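With these two guards in place, max_chunk_limit has three user-visible modes: unset (default 100), a positive override, or -1 to disable the check entirely. The split between exception types is deliberate, and the following compact sketch shows how the create-time and ingest-time checks divide the work (resolveMaxChunkLimit and enforceMaxChunkLimit are hypothetical helper names used only for this illustration):

import java.util.Locale;

final class MaxChunkLimitSketch {
    static final int DEFAULT_MAX_CHUNK_LIMIT = 100;
    static final int DISABLED_MAX_CHUNK_LIMIT = -1;

    // Pipeline creation: a bad setting is the caller's mistake -> IllegalArgumentException.
    static int resolveMaxChunkLimit(final int configuredValue) { // hypothetical helper
        if (configuredValue <= 0 && configuredValue != DISABLED_MAX_CHUNK_LIMIT) {
            throw new IllegalArgumentException(
                String.format(
                    Locale.ROOT,
                    "Parameter [max_chunk_limit] must be positive or %s to disable this parameter",
                    DISABLED_MAX_CHUNK_LIMIT
                )
            );
        }
        return configuredValue;
    }

    // Ingest time: exceeding the limit depends on the document, not the pipeline -> IllegalStateException.
    static void enforceMaxChunkLimit(final int chunkCount, final int maxChunkLimit) { // hypothetical helper
        if (maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkCount > maxChunkLimit) {
            throw new IllegalStateException(
                String.format(Locale.ROOT, "Chunk count [%s] exceeds the allowed maximum of [%s]", chunkCount, maxChunkLimit)
            );
        }
    }
}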
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
index 56916ea34..4ef83e03a 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
@@ -38,23 +38,30 @@ public static String parseStringParameter(final Map parameters,
 }
 
 /**
- * Parse Integer type parameter with positive value.
- * Throw IllegalArgumentException if parameter is not a positive integer.
+ * Parse Integer type parameter.
+ * Throw IllegalArgumentException if parameter is not an integer.
 */
- public static int parsePositiveIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) {
+ public static int parseIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) {
 if (!parameters.containsKey(fieldName)) {
- // all chunking algorithm parameters are optional
+ // all integer parameters are optional
 return defaultValue;
 }
- int fieldValueInt;
 String fieldValueString = parameters.get(fieldName).toString();
 try {
- fieldValueInt = NumberUtils.createInteger(fieldValueString);
+ return NumberUtils.createInteger(fieldValueString);
 } catch (Exception e) {
 throw new IllegalArgumentException(
 String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Integer.class.getName())
 );
 }
+ }
+
+ /**
+ * Parse Integer type parameter with positive value.
+ * Throw IllegalArgumentException if parameter is not a positive integer.
+ */
+ public static int parsePositiveIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) {
+ int fieldValueInt = parseIntegerParameter(parameters, fieldName, defaultValue);
 if (fieldValueInt <= 0) {
 throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be positive.", fieldName));
 }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index 239537a52..37b66cef1 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -201,10 +201,21 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() {
 IllegalArgumentException.class,
 () -> textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
 );
- assertEquals(
- String.format(Locale.ROOT, "Parameter [%s] must be positive.", MAX_CHUNK_LIMIT_FIELD),
- illegalArgumentException.getMessage()
- );
+ assert (illegalArgumentException.getMessage()
+ .contains(String.format(Locale.ROOT, "Parameter [%s] must be positive", MAX_CHUNK_LIMIT_FIELD)));
+ }
+
+ @SneakyThrows
+ public void testCreate_whenMaxChunkNumDisabledValue_thenSucceed() {
+ Map registry = new HashMap<>();
+ Map config = new HashMap<>();
+ Map fieldMap = new HashMap<>();
+ Map algorithmMap = new HashMap<>();
+ fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
+ algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(-1));
+ config.put(FIELD_MAP_FIELD, fieldMap);
+ config.put(ALGORITHM_FIELD, algorithmMap);
+ textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
 }
 
 public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() {
@@ -379,23 +390,36 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT
 }
 }
 
+ @SneakyThrows
+ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumDisabled_thenFail() {
+ TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), -1);
+ IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
+ IngestDocument document = processor.execute(ingestDocument);
+ assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
+ Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
+ assert (passages instanceof List);
+ List expectedPassages = new ArrayList<>();
+ expectedPassages.add("This is an example 
document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); + expectedPassages.add("standard tokenizer in OpenSearch."); + assertEquals(expectedPassages, passages); + } + @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumExceed_thenFail() { - TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 1); + int maxChunkLimit = 1; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), maxChunkLimit); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> processor.execute(ingestDocument) - ); - assertEquals( - String.format( - Locale.ROOT, - "Unable to chunk the document as the number of chunks [%s] exceeds the maximum chunk limit [%s]", - 3, - 1 - ), - illegalArgumentException.getMessage() - ); + IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument)); + assert (illegalStateException.getMessage() + .contains( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", + TYPE, + maxChunkLimit + ) + )); } @SneakyThrows From 0a04012976f24ed068baa6471f464f15134ee11b Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 15:25:20 +0800 Subject: [PATCH 165/189] add unit test with non list of string Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessorTests.java | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 37b66cef1..48adf67e9 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -4,6 +4,7 @@ */ package org.opensearch.neuralsearch.processor; +import com.google.common.collect.ImmutableMap; import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; @@ -302,7 +303,7 @@ private List createSourceDataListStrings() { return documents; } - private List createSourceDataListHybridType() { + private List createSourceDataListWithInvalidType() { List documents = new ArrayList<>(); documents.add( "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." @@ -311,6 +312,15 @@ private List createSourceDataListHybridType() { return documents; } + private List createSourceDataListWithHybridType() { + List documents = new ArrayList<>(); + documents.add( + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ );
+ documents.add(ImmutableMap.of());
+ return documents;
+ }
+
 private List createSourceDataListWithNull() {
 List documents = new ArrayList<>();
 documents.add(
@@ -489,9 +499,9 @@ public void testExecute_withFixedTokenLength_andSourceDataListStrings_thenSuccee
 }
 
 @SneakyThrows
- public void testExecute_withFixedTokenLength_andSourceDataListHybridType_thenFail() {
+ public void testExecute_withFixedTokenLength_andSourceDataListWithInvalidType_thenFail() {
 TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
- IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListHybridType());
+ IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListWithInvalidType());
 IllegalArgumentException illegalArgumentException = assertThrows(
 IllegalArgumentException.class,
 () -> processor.execute(ingestDocument)
@@ -588,6 +598,18 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceDataList
 }
 }
 
+ @SneakyThrows
+ public void testExecute_withFixedTokenLength_andSourceDataListWithHybridType_thenSucceed() {
+ TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
+ List sourceDataList = createSourceDataListWithHybridType();
+ IngestDocument ingestDocument = createIngestDocumentWithSourceData(sourceDataList);
+ IngestDocument document = processor.execute(ingestDocument);
+ assert document.getSourceAndMetadata().containsKey(INPUT_FIELD);
+ Object listResult = document.getSourceAndMetadata().get(OUTPUT_FIELD);
+ assert (listResult instanceof List);
+ assertEquals(((List) listResult).size(), 0);
+ }
+

From a524954e138ece06c8adb001534a907df64ca005 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 15:28:19 +0800
Subject: [PATCH 166/189] add unit test with null input

Signed-off-by: yuye-aws

---
 .../processor/TextChunkingProcessorTests.java | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index 48adf67e9..2296fc05c 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -610,6 +610,17 @@ public void testExecute_withFixedTokenLength_andSourceDataListWithHybridType_the
 assertEquals(((List) listResult).size(), 0);
 }
 
+ @SneakyThrows
+ public void testExecute_withFixedTokenLength_andSourceDataNull_thenSucceed() {
+ TextChunkingProcessor processor = createFixedTokenLengthInstance(createStringFieldMap());
+ IngestDocument ingestDocument = createIngestDocumentWithSourceData(null);
+ IngestDocument document = processor.execute(ingestDocument);
+ assert document.getSourceAndMetadata().containsKey(INPUT_FIELD);
+ Object listResult = document.getSourceAndMetadata().get(OUTPUT_FIELD);
+ assert (listResult instanceof List);
+ assertEquals(((List) listResult).size(), 0);
+ }
+
 @SneakyThrows
 public void testExecute_withDelimiter_andSourceDataString_thenSucceed() {
 TextChunkingProcessor processor = createDelimiterInstance();

From 3f41f3708c48eac107b1f7cad3296daed1b40ca2 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 15:38:05 +0800
Subject: [PATCH 167/189] add unit test for tokenization exception in 
fixed token length algorithm Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 3 +- .../chunker/FixedTokenLengthChunkerTests.java | 35 +++++++++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index c21fcf12b..f418771ca 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -4,7 +4,6 @@ */ package org.opensearch.neuralsearch.processor.chunker; -import java.io.IOException; import java.util.Locale; import java.util.Map; import java.util.List; @@ -156,7 +155,7 @@ private List tokenize(final String content, final String tokenizer try { AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, analysisRegistry, null, maxTokenCount); return analyzeResponse.getTokens(); - } catch (IOException e) { + } catch (Exception e) { throw new IllegalStateException( String.format(Locale.ROOT, "%s algorithm encounters exception in tokenization: %s", ALGORITHM_NAME, e.getMessage()), e diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 979228366..5098d126d 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -62,11 +62,11 @@ public Map> getTokeniz return new FixedTokenLengthChunker(nonRuntimeParameters); } - public void testValidateAndParseParameters_whenNoParams_thenSuccessful() { + public void testParseParameters_whenNoParams_thenSuccessful() { fixedTokenLengthChunker.parseParameters(Map.of()); } - public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() { + public void testParseParameters_whenIllegalTokenLimitType_thenFail() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit"); IllegalArgumentException illegalArgumentException = assertThrows( @@ -79,7 +79,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitType_thenFail() ); } - public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail() { + public void testParseParameters_whenIllegalTokenLimitValue_thenFail() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, -1); IllegalArgumentException illegalArgumentException = assertThrows( @@ -92,7 +92,7 @@ public void testValidateAndParseParameters_whenIllegalTokenLimitValue_thenFail() ); } - public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() { + public void testParseParameters_whenIllegalOverlapRateType_thenFail() { Map parameters = new HashMap<>(); parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate"); IllegalArgumentException illegalArgumentException = assertThrows( @@ -105,7 +105,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateType_thenFail() ); } - public void testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail() { + public void testParseParameters_whenIllegalOverlapRateValue_thenFail() { Map parameters = new HashMap<>(); parameters.put(OVERLAP_RATE_FIELD, 0.6); IllegalArgumentException illegalArgumentException = assertThrows( @@ -118,7 
+118,7 @@ public void testValidateAndParseParameters_whenIllegalOverlapRateValue_thenFail( ); } - public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { + public void testParseParameters_whenIllegalTokenizerType_thenFail() { Map parameters = new HashMap<>(); parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( @@ -131,7 +131,7 @@ public void testValidateAndParseParameters_whenIllegalTokenizerType_thenFail() { ); } - public void testValidateAndParseParameters_whenUnsupportedTokenizer_thenFail() { + public void testParseParameters_whenUnsupportedTokenizer_thenFail() { String ngramTokenizer = "ngram"; Map parameters = Map.of(TOKENIZER_FIELD, ngramTokenizer); IllegalArgumentException illegalArgumentException = assertThrows( @@ -142,7 +142,22 @@ public void testValidateAndParseParameters_whenUnsupportedTokenizer_thenFail() { .contains(String.format(Locale.ROOT, "Tokenizer [%s] is not supported for [%s] algorithm.", ngramTokenizer, ALGORITHM_NAME))); } - public void testChunk_withTokenLimit_10() { + public void testChunk_whenTokenizationException_thenFail() { + // lowercase tokenizer is not supported in unit tests + String lowercaseTokenizer = "lowercase"; + Map parameters = Map.of(TOKENIZER_FIELD, lowercaseTokenizer); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + String content = + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + IllegalStateException illegalStateException = assertThrows( + IllegalStateException.class, + () -> fixedTokenLengthChunker.chunk(content, parameters) + ); + assert (illegalStateException.getMessage() + .contains(String.format(Locale.ROOT, "%s algorithm encounters exception in tokenization", ALGORITHM_NAME))); + } + + public void testChunk_withTokenLimit10_thenSucceed() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); parameters.put(TOKENIZER_FIELD, "standard"); @@ -159,7 +174,7 @@ public void testChunk_withTokenLimit_10() { assertEquals(expectedPassages, passages); } - public void testChunk_withTokenLimit_20() { + public void testChunk_withTokenLimit20_thenSucceed() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 20); parameters.put(TOKENIZER_FIELD, "standard"); @@ -177,7 +192,7 @@ public void testChunk_withTokenLimit_20() { assertEquals(expectedPassages, passages); } - public void testChunk_withOverlapRate_half() { + public void testChunk_withOverlapRateHalf_thenSucceed() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); parameters.put(OVERLAP_RATE_FIELD, 0.5); From 3f41f3708c48eac107b1f7cad3296daed1b40ca2 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 15:44:38 +0800 Subject: [PATCH 168/189] tune method name in text chunking processor unit test Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessorTests.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 2296fc05c..f1e3024b6 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -219,7 +219,7 @@ public void 
testCreate_whenMaxChunkNumDisabledValue_thenSucceed() { textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } - public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { + public void testCreate_whenAlgorithmMapMultipleAlgorithms_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); @@ -239,7 +239,7 @@ public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() { ); } - public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_thenFail() { + public void testCreate_wheAlgorithmMapInvalidAlgorithmName_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); @@ -257,7 +257,7 @@ public void testCreate_whenAlgorithmFieldInvalidAlgorithmName_thenFail() { .contains(String.format(Locale.ROOT, "Chunking algorithm [%s] is not supported.", invalid_algorithm_type))); } - public void testCreate_whenAlgorithmFieldInvalidAlgorithmContent_thenFail() { + public void testCreate_whenAlgorithmMapInvalidAlgorithmType_thenFail() { Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); From e4bdabc9bc55edb4fe2306061ce1046ead6feda9 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 15:50:39 +0800 Subject: [PATCH 169/189] tune method name in delimiter algorithm unit test Signed-off-by: yuye-aws --- .../chunker/DelimiterChunkerTests.java | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 7746a611e..c7184d7fc 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -15,7 +15,7 @@ public class DelimiterChunkerTests extends OpenSearchTestCase { - public void testChunkerWithDelimiterFieldNotString() { + public void testCreate_withDelimiterFieldInvalidType_thenFail() { Exception exception = assertThrows( IllegalArgumentException.class, () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, List.of(""))) @@ -26,19 +26,19 @@ public void testChunkerWithDelimiterFieldNotString() { ); } - public void testChunkerWithDelimiterFieldNoString() { + public void testCreate_withDelimiterFieldEmptyString_thenFail() { Exception exception = assertThrows(IllegalArgumentException.class, () -> new DelimiterChunker(Map.of(DELIMITER_FIELD, ""))); Assert.assertEquals(String.format(Locale.ROOT, "Parameter [%s] should not be empty.", DELIMITER_FIELD), exception.getMessage()); } - public void testChunker() { + public void testChunk_withNewlineDelimiter_thenSucceed() { DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); String content = "a\nb\nc\nd"; List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } - public void testChunkerWithDefaultDelimiter() { + public void testChunk_withDefaultDelimiter_thenSucceed() { // default delimiter is \n\n DelimiterChunker chunker = new DelimiterChunker(Map.of()); String content = "a.b\n\nc.d"; @@ -46,35 +46,28 @@ public void testChunkerWithDefaultDelimiter() { assertEquals(List.of("a.b\n\n", "c.d"), chunkResult); } - public void testChunkerWithDelimiterEnd() { - DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n")); - String 
content = "a\nb\nc\nd\n";
- List chunkResult = chunker.chunk(content, Map.of());
- assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult);
- }
-
- public void testChunkerWithOnlyDelimiter() {
+ public void testChunk_withOnlyDelimiterContent_thenSucceed() {
 DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n"));
 String content = "\n";
 List chunkResult = chunker.chunk(content, Map.of());
 assertEquals(List.of("\n"), chunkResult);
 }
 
- public void testChunkerWithAllDelimiters() {
+ public void testChunk_WithAllDelimiterContent_thenSucceed() {
 DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n"));
 String content = "\n\n\n";
 List chunkResult = chunker.chunk(content, Map.of());
 assertEquals(List.of("\n", "\n", "\n"), chunkResult);
 }
 
- public void testChunkerWithDifferentDelimiters() {
+ public void testChunk_WithPeriodDelimiters_thenSucceed() {
 DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "."));
 String content = "a.b.cc.d.";
 List chunkResult = chunker.chunk(content, Map.of());
 assertEquals(List.of("a.", "b.", "cc.", "d."), chunkResult);
 }
 
- public void testChunkerWithStringDelimiter() {
+ public void testChunk_withDoubleNewlineDelimiter_thenSucceed() {
 DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n"));
 String content = "\n\na\n\n\n";
 List chunkResult = chunker.chunk(content, Map.of());

From 9e37171ce0957bc28dfc24f41c42ed293b017eb2 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 15:52:30 +0800
Subject: [PATCH 170/189] add unit test for overlap rate too small in fixed token length algorithm

Signed-off-by: yuye-aws

---
 .../chunker/FixedTokenLengthChunkerTests.java | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
index 5098d126d..2ad1ea18e 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
@@ -105,7 +105,7 @@ public void testParseParameters_whenIllegalOverlapRateType_thenFail() {
 );
 }
 
- public void testParseParameters_whenIllegalOverlapRateValue_thenFail() {
+ public void testParseParameters_whenTooLargeOverlapRate_thenFail() {
 Map parameters = new HashMap<>();
 parameters.put(OVERLAP_RATE_FIELD, 0.6);
 IllegalArgumentException illegalArgumentException = assertThrows(
@@ -118,6 +118,19 @@ public void testParseParameters_whenTooLargeOverlapRate_thenFail(
 );
 }
 
+ public void testParseParameters_whenTooSmallOverlapRateValue_thenFail() {
+ Map parameters = new HashMap<>();
+ parameters.put(OVERLAP_RATE_FIELD, -1);
+ IllegalArgumentException illegalArgumentException = assertThrows(
+ IllegalArgumentException.class,
+ () -> fixedTokenLengthChunker.parseParameters(parameters)
+ );
+ assertEquals(
+ String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, 0.0, 0.5),
+ illegalArgumentException.getMessage()
+ );
+ }
+
 public void testParseParameters_whenIllegalTokenizerType_thenFail() {
 Map parameters = new HashMap<>();
 parameters.put(TOKENIZER_FIELD, 111);
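For intuition on why the accepted range is bounded at both ends: the chunker derives the number of overlapping tokens from token_limit and overlap_rate, and an overlap above half the window would make each passage repeat more than it advances. A rough sketch of the window arithmetic (the names and the flooring are assumptions of this illustration; the real implementation walks analyzer tokens and their offsets):

final class OverlapArithmeticSketch {
    // With token_limit = 10 and overlap_rate = 0.5, each passage shares 5 tokens
    // with its predecessor and contributes 5 new ones.
    static int overlapTokenNumber(final int tokenLimit, final double overlapRate) {
        return (int) Math.floor(tokenLimit * overlapRate); // rounding mode assumed for this sketch
    }

    static int stride(final int tokenLimit, final double overlapRate) {
        // tokens of fresh content per passage; overlap_rate <= 0.5 keeps this at least half the window
        return tokenLimit - overlapTokenNumber(tokenLimit, overlapRate);
    }
}

A negative rate would push the stride past the window and skip tokens, while a rate above 0.5 would shrink it below half the window and balloon the passage count, so both the -1 and 0.6 inputs above are rejected at parse time.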
From 18ba1b163c05bce5cc806fcdeb6152ec3de80610 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 15:56:15 +0800
Subject: [PATCH 171/189] tune method modifier for all classes

Signed-off-by: yuye-aws

---
 .../neuralsearch/processor/chunker/ChunkerFactory.java | 4 +++-
 .../processor/chunker/ChunkerParameterParser.java | 4 +++-
 .../neuralsearch/processor/chunker/DelimiterChunker.java | 2 +-
 .../processor/chunker/FixedTokenLengthChunker.java | 2 +-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
index a3f7346d5..1befdad2a 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java
@@ -15,7 +15,9 @@
 /**
  * A factory to create different chunking algorithm objects and return all supported chunking algorithms.
  */
-public class ChunkerFactory {
+public final class ChunkerFactory {
+
+ private ChunkerFactory() {} // no instance of this factory class
 
 private static final ImmutableMap, Chunker>> chunkers = ImmutableMap.of(
 FixedTokenLengthChunker.ALGORITHM_NAME,
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
index 4ef83e03a..e80169fdb 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java
@@ -14,7 +14,9 @@
 * Parse the parameter for text chunking processor and algorithms.
 * Throw IllegalArgumentException when parameters are invalid.
 */
-public class ChunkerParameterParser {
+public final class ChunkerParameterParser {
+
+ private ChunkerParameterParser() {} // no instance of this util class
 
 /**
 * Parse string type parameter.
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
index 0008fad21..eeeda07c2 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
@@ -13,7 +13,7 @@
 /**
 * The implementation {@link Chunker} for delimiter algorithm
 */
-public class DelimiterChunker implements Chunker {
+public final class DelimiterChunker implements Chunker {
 
 public static final String ALGORITHM_NAME = "delimiter";
 public static final String DELIMITER_FIELD = "delimiter";
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
index f418771ca..8da51dd95 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
@@ -21,7 +21,7 @@
 /**
 * The implementation {@link Chunker} for fixed token length algorithm. 
*/ -public class FixedTokenLengthChunker implements Chunker { +public final class FixedTokenLengthChunker implements Chunker { public static final String ALGORITHM_NAME = "fixed_token_length"; From 2ce9840d3a190c03f328a7277d58c3a3b89eb7ae Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 16:26:46 +0800 Subject: [PATCH 172/189] tune code Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 4 ++-- .../neuralsearch/processor/chunker/ChunkerFactory.java | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 268248867..80e1ce064 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -33,8 +33,8 @@ import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; /** - * This processor is used for user input data text chunking. - * The chunking results could be fed to downstream embedding processor. + * This processor is used for text chunking. + * The text chunking results could be fed to downstream embedding processor. * The processor needs two fields: algorithm and field_map, * where algorithm defines chunking algorithm and parameters, * and field_map specifies which fields needs chunking and the corresponding keys for the chunking results. diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 1befdad2a..804322a0b 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -19,7 +19,7 @@ public final class ChunkerFactory { private ChunkerFactory() {} // no instance of this factory class - private static final ImmutableMap, Chunker>> chunkers = ImmutableMap.of( + private static final Map, Chunker>> chunkerConstructors = ImmutableMap.of( FixedTokenLengthChunker.ALGORITHM_NAME, FixedTokenLengthChunker::new, DelimiterChunker.ALGORITHM_NAME, @@ -27,10 +27,10 @@ private ChunkerFactory() {} // no instance of this factory class ); @Getter - public static Set allChunkerAlgorithms = chunkers.keySet(); + public static Set allChunkerAlgorithms = chunkerConstructors.keySet(); public static Chunker create(final String type, final Map parameters) { - Function, Chunker> chunkerConstructionFunction = chunkers.get(type); + Function, Chunker> chunkerConstructionFunction = chunkerConstructors.get(type); // chunkerConstructionFunction is not null because we have validated the type in text chunking processor Objects.requireNonNull(chunkerConstructionFunction); return chunkerConstructionFunction.apply(parameters); From 2aea7a592018be5862315d304870eedf2a0099f8 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 16:30:07 +0800 Subject: [PATCH 173/189] tune code Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 6 ++---- .../neuralsearch/processor/chunker/ChunkerFactory.java | 8 +++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 80e1ce064..05ce1cf36 100644 --- 
a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -11,7 +11,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; -import java.util.Set; import com.google.common.annotations.VisibleForTesting; @@ -116,14 +115,13 @@ private void parseAlgorithmMap(final Map algorithmMap) { } } - Set allChunkerAlgorithms = ChunkerFactory.allChunkerAlgorithms; - if (!allChunkerAlgorithms.contains(algorithmKey)) { + if (!ChunkerFactory.CHUNKER_ALGORITHMS.contains(algorithmKey)) { throw new IllegalArgumentException( String.format( Locale.ROOT, "Chunking algorithm [%s] is not supported. Supported chunking algorithms are %s", algorithmKey, - allChunkerAlgorithms + ChunkerFactory.CHUNKER_ALGORITHMS ) ); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 804322a0b..76e6b8092 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -5,7 +5,6 @@ package org.opensearch.neuralsearch.processor.chunker; import com.google.common.collect.ImmutableMap; -import lombok.Getter; import java.util.Map; import java.util.Objects; @@ -19,18 +18,17 @@ public final class ChunkerFactory { private ChunkerFactory() {} // no instance of this factory class - private static final Map, Chunker>> chunkerConstructors = ImmutableMap.of( + private static final Map, Chunker>> CHUNKERS_CONSTRUCTORS = ImmutableMap.of( FixedTokenLengthChunker.ALGORITHM_NAME, FixedTokenLengthChunker::new, DelimiterChunker.ALGORITHM_NAME, DelimiterChunker::new ); - @Getter - public static Set allChunkerAlgorithms = chunkerConstructors.keySet(); + public static Set CHUNKER_ALGORITHMS = CHUNKERS_CONSTRUCTORS.keySet(); public static Chunker create(final String type, final Map parameters) { - Function, Chunker> chunkerConstructionFunction = chunkerConstructors.get(type); + Function, Chunker> chunkerConstructionFunction = CHUNKERS_CONSTRUCTORS.get(type); // chunkerConstructionFunction is not null because we have validated the type in text chunking processor Objects.requireNonNull(chunkerConstructionFunction); return chunkerConstructionFunction.apply(parameters); From 63bbae9f5be2da3a9251e5e960fc685953072bc7 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 16:36:05 +0800 Subject: [PATCH 174/189] tune exception type in parameter parser Signed-off-by: yuye-aws --- .../processor/chunker/ChunkerParameterParser.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java index e80169fdb..65bbd135d 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -51,7 +51,7 @@ public static int parseIntegerParameter(final Map parameters, fi String fieldValueString = parameters.get(fieldName).toString(); try { return NumberUtils.createInteger(fieldValueString); - } catch (Exception e) { + } catch (NumberFormatException e) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, 
Integer.class.getName()) ); @@ -82,7 +82,7 @@ public static double parseDoubleParameter(final Map parameters, String fieldValueString = parameters.get(fieldName).toString(); try { return NumberUtils.createDouble(fieldValueString); - } catch (Exception e) { + } catch (NumberFormatException e) { throw new IllegalArgumentException( String.format(Locale.ROOT, "Parameter [%s] must be of %s type", fieldName, Double.class.getName()) ); From aaee028a91f464b519915a904e6bc675f383fcce Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 16:38:56 +0800 Subject: [PATCH 175/189] tune comment Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 05ce1cf36..a13c2339a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -278,7 +278,7 @@ private int chunkMapType( ); } } else { - // chunk the object when target key is a string + // chunk the object when target key is of leaf type (null, string and list of string) Object chunkObject = sourceAndMetadataMap.get(originalKey); List chunkedResult = new ArrayList<>(); updatedChunkCount = chunkLeafType(chunkObject, chunkedResult, runtimeParameters, updatedChunkCount); From ab2a15106b7b57b4c36dfe44c4cbba2c3b4ee46f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 16:58:39 +0800 Subject: [PATCH 176/189] tune comment Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 5 ----- .../neuralsearch/processor/chunker/Chunker.java | 7 +++---- .../neuralsearch/processor/chunker/ChunkerFactory.java | 2 +- .../processor/chunker/ChunkerParameterParser.java | 8 ++++---- .../neuralsearch/processor/chunker/DelimiterChunker.java | 6 +++--- .../processor/chunker/FixedTokenLengthChunker.java | 4 ++-- 6 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index a13c2339a..374f7f69b 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -51,16 +51,11 @@ public final class TextChunkingProcessor extends AbstractProcessor { private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME; private int maxChunkLimit; - private Chunker chunker; private final Map fieldMap; - private final ClusterService clusterService; - private final IndicesService indicesService; - private final AnalysisRegistry analysisRegistry; - private final Environment environment; public TextChunkingProcessor( diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index da67c91d0..f5a0cc64e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -9,25 +9,24 @@ /** * The interface for all chunking algorithms. - * All algorithms need to validate parameters and chunk the content. + * All algorithms need to parse parameters and chunk the content. 
*/ public interface Chunker { /** * Parse the parameters for chunking algorithm. * Throw IllegalArgumentException when parameters are invalid. - * The parameters must be validated before parsing. * * @param parameters a map containing non-runtime parameters for chunking algorithms */ void parseParameters(Map parameters); /** - * Chunk the incoming string according to parameters and return chunked passages + * Chunk the input string according to parameters and return chunked passages * * @param content input string * @param runtimeParameters a map containing runtime parameters for chunking algorithms - * @return Chunked passages + * @return chunked passages */ List chunk(String content, Map runtimeParameters); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index 76e6b8092..aab9eaa3e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -12,7 +12,7 @@ import java.util.function.Function; /** - * A factory to create different chunking algorithm objects and return all supported chunking algorithms. + * A factory to create different chunking algorithm objects. */ public final class ChunkerFactory { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java index 65bbd135d..56a61a26f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -19,8 +19,8 @@ public final class ChunkerParameterParser { private ChunkerParameterParser() {} // no instance of this util class /** - * Parse string type parameter. - * Throw IllegalArgumentException if parameter is not a string or empty. + * Parse String type parameter. + * Throw IllegalArgumentException if parameter is not a string or an empty string. */ public static String parseStringParameter(final Map parameters, final String fieldName, final String defaultValue) { if (!parameters.containsKey(fieldName)) { @@ -40,7 +40,7 @@ public static String parseStringParameter(final Map parameters, } /** - * Parse Integer type parameter. + * Parse integer type parameter. * Throw IllegalArgumentException if parameter is not an integer. */ public static int parseIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { @@ -59,7 +59,7 @@ public static int parseIntegerParameter(final Map parameters, fi } /** - * Parse Integer type parameter with positive value. + * Parse integer type parameter with positive value. * Throw IllegalArgumentException if parameter is not a positive integer. */ public static int parsePositiveIntegerParameter(final Map parameters, final String fieldName, final int defaultValue) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index eeeda07c2..dff06c635 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -27,9 +27,9 @@ public DelimiterChunker(final Map parameters) { /** * Parse the parameters for delimiter algorithm. 
- * Throw IllegalArgumentException if delimiter is not a string or empty. + * Throw IllegalArgumentException if delimiter is not a string or an empty string. * - * @param parameters a map containing parameters, containing the following parameters + * @param parameters a map with non-runtime parameters as the following: * 1. delimiter A string as the paragraph split indicator */ @Override @@ -38,7 +38,7 @@ public void parseParameters(Map parameters) { } /** - * Return the chunked passages for fixed token length algorithm + * Return the chunked passages for delimiter algorithm * * @param content input string * @param runtimeParameters a map for runtime parameters, but not needed by delimiter algorithm diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 8da51dd95..bbd2f1301 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -66,11 +66,11 @@ public FixedTokenLengthChunker(final Map parameters) { * Parse the parameters for fixed token length algorithm. * Throw IllegalArgumentException when parameters are invalid. * - * @param parameters a map non-runtime parameters as the following: + * @param parameters a map with non-runtime parameters as the following: * 1. tokenizer: the word tokenizer in opensearch * 2. token_limit: the token limit for each chunked passage * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage - * Here are requirements for parameters: + * Here are requirements for non-runtime parameters: * 1. token_limit must be a positive integer * 2. overlap_rate must be within range [0, 0.5] * 3. 
tokenizer must be a word tokenizer From 1eb12aaaaf38faf4e932b11a887aebc0d0aff144 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 23:21:51 +0800 Subject: [PATCH 177/189] include max chunk limit in both algorithms Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 73 +++++++------------ .../processor/chunker/Chunker.java | 4 + .../processor/chunker/DelimiterChunker.java | 30 +++++++- .../chunker/FixedTokenLengthChunker.java | 21 +++++- .../processor/TextChunkingProcessorTests.java | 2 +- 5 files changed, 80 insertions(+), 50 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 374f7f69b..65e5eb4f6 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -12,8 +12,6 @@ import java.util.List; import java.util.Objects; -import com.google.common.annotations.VisibleForTesting; - import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.env.Environment; import org.opensearch.index.IndexService; @@ -29,6 +27,9 @@ import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.DEFAULT_MAX_CHUNK_LIMIT; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; /** @@ -43,11 +44,6 @@ public final class TextChunkingProcessor extends AbstractProcessor { public static final String TYPE = "text_chunking"; public static final String FIELD_MAP_FIELD = "field_map"; public static final String ALGORITHM_FIELD = "algorithm"; - @VisibleForTesting - static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; - - private static final int DEFAULT_MAX_CHUNK_LIMIT = 100; - private static final int DISABLED_MAX_CHUNK_LIMIT = -1; private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME; private int maxChunkLimit; @@ -121,11 +117,7 @@ private void parseAlgorithmMap(final Map algorithmMap) { ); } Map chunkerParameters = (Map) algorithmValue; - if (algorithmKey.equals(FixedTokenLengthChunker.ALGORITHM_NAME)) { - // fixed token length algorithm needs analysis registry for tokenization - chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); - } - this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); + // parse processor level max chunk limit this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); if (maxChunkLimit <= 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) { throw new IllegalArgumentException( @@ -137,6 +129,11 @@ private void parseAlgorithmMap(final Map algorithmMap) { ) ); } + if (algorithmKey.equals(FixedTokenLengthChunker.ALGORITHM_NAME)) { + // fixed token length algorithm needs analysis registry for tokenization + chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); + } + this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); } @SuppressWarnings("unchecked") @@ -181,6 +178,7 @@ public IngestDocument execute(final IngestDocument 
ingestDocument) { int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); } + runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, 0); return ingestDocument; } @@ -275,58 +273,43 @@ private int chunkMapType( } else { // chunk the object when target key is of leaf type (null, string and list of string) Object chunkObject = sourceAndMetadataMap.get(originalKey); - List chunkedResult = new ArrayList<>(); - updatedChunkCount = chunkLeafType(chunkObject, chunkedResult, runtimeParameters, updatedChunkCount); + List chunkedResult = chunkLeafType(chunkObject, runtimeParameters); sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); } } return updatedChunkCount; } - private int chunkString(final String content, List result, final Map runTimeParameters, final int chunkCount) { - // chunk the content, return the updated chunkCount and add chunk passages into result - int updatedChunkCount = chunkCount; + /** + * Chunk the content, update the runtime max_chunk_limit and return the result + */ + private List chunkString(final String content, final Map runTimeParameters) { + // update runtime max_chunk_limit for each content List contentResult = chunker.chunk(content, runTimeParameters); - updatedChunkCount += contentResult.size(); - if (updatedChunkCount > maxChunkLimit && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.", - TYPE, - maxChunkLimit, - MAX_CHUNK_LIMIT_FIELD - ) - ); - } - result.addAll(contentResult); - return updatedChunkCount; + int runtimeMaxChunkLimit = parseIntegerParameter(runTimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); + runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit); + return contentResult; } - private int chunkList( - final List contentList, - List result, - final Map runTimeParameters, - final int chunkCount - ) { + private List chunkList(final List contentList, final Map runTimeParameters) { // flatten original output format from List> to List - int updatedChunkCount = chunkCount; + List result = new ArrayList<>(); for (String content : contentList) { - updatedChunkCount = chunkString(content, result, runTimeParameters, updatedChunkCount); + result.addAll(chunkString(content, runTimeParameters)); } - return updatedChunkCount; + return result; } @SuppressWarnings("unchecked") - private int chunkLeafType(final Object value, List result, final Map runTimeParameters, final int chunkCount) { + private List chunkLeafType(final Object value, final Map runTimeParameters) { // leaf type means null, String or List // the result should be an empty list when the input is null - int updatedChunkCount = chunkCount; + List result = new ArrayList<>(); if (value instanceof String) { - updatedChunkCount = chunkString(value.toString(), result, runTimeParameters, updatedChunkCount); + result = chunkString(value.toString(), runTimeParameters); } else if (isListOfString(value)) { - updatedChunkCount = chunkList((List) value, result, runTimeParameters, updatedChunkCount); + result = chunkList((List) value, runTimeParameters); } - return updatedChunkCount; + return result; } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java 
b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index f5a0cc64e..385e1b724 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -13,6 +13,10 @@ */ public interface Chunker { + static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + static final int DEFAULT_MAX_CHUNK_LIMIT = 100; + static final int DISABLED_MAX_CHUNK_LIMIT = -1; + /** * Parse the parameters for chunking algorithm. * Throw IllegalArgumentException when parameters are invalid. diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index dff06c635..3e9d415de 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -6,8 +6,11 @@ import java.util.Map; import java.util.List; +import java.util.Locale; import java.util.ArrayList; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; /** @@ -16,10 +19,13 @@ public final class DelimiterChunker implements Chunker { public static final String ALGORITHM_NAME = "delimiter"; + public static final String DELIMITER_FIELD = "delimiter"; + public static final String DEFAULT_DELIMITER = "\n\n"; private String delimiter; + private int maxChunkLimit; public DelimiterChunker(final Map parameters) { parseParameters(parameters); @@ -31,25 +37,31 @@ public DelimiterChunker(final Map parameters) { * * @param parameters a map with non-runtime parameters as the following: * 1. delimiter A string as the paragraph split indicator + * 2. max_chunk_limit processor level max chunk level */ @Override public void parseParameters(Map parameters) { this.delimiter = parseStringParameter(parameters, DELIMITER_FIELD, DEFAULT_DELIMITER); + this.maxChunkLimit = parseIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); } /** * Return the chunked passages for delimiter algorithm * * @param content input string - * @param runtimeParameters a map for runtime parameters, but not needed by delimiter algorithm + * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: + * 1. 
max_chunk_level content level max chunk limit */ @Override public List chunk(final String content, final Map runtimeParameters) { + int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); + List chunkResult = new ArrayList<>(); int start = 0, end; int nextDelimiterPosition = content.indexOf(delimiter); while (nextDelimiterPosition != -1) { + checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit); end = nextDelimiterPosition + delimiter.length(); chunkResult.add(content.substring(start, end)); start = end; @@ -57,9 +69,25 @@ public List chunk(final String content, final Map runtim } if (start < content.length()) { + checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit); chunkResult.add(content.substring(start)); } return chunkResult; } + + private void checkRunTimeMaxChunkLimit(int chunkResultLength, int runtimeMaxChunkLimit) { + if (chunkResultLength == runtimeMaxChunkLimit) { + // need processorMaxChunkLimit to keep exception message consistent + throw new IllegalStateException( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.", + TYPE, + maxChunkLimit, + MAX_CHUNK_LIMIT_FIELD + ) + ); + } + } } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index bbd2f1301..bf56f7f4f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -13,9 +13,11 @@ import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.action.admin.indices.analyze.AnalyzeAction; import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter; +import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parsePositiveIntegerParameter; /** @@ -53,6 +55,7 @@ public final class FixedTokenLengthChunker implements Chunker { // parameter value private int tokenLimit; + private int maxChunkLimit; private String tokenizer; private double overlapRate; private final AnalysisRegistry analysisRegistry; @@ -80,6 +83,7 @@ public void parseParameters(Map parameters) { this.tokenLimit = parsePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT); this.overlapRate = parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE); this.tokenizer = parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER); + this.maxChunkLimit = parseIntegerParameter(parameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); if (overlapRate < OVERLAP_RATE_LOWER_BOUND || overlapRate > OVERLAP_RATE_UPPER_BOUND) { throw new IllegalArgumentException( String.format( @@ -111,13 +115,12 @@ public void parseParameters(Map parameters) { * @param content input string * @param runtimeParameters a map for runtime 
parameters, containing the following runtime parameters: * 1. max_token_count the max token limit for the tokenizer - * Here are requirements for runtime parameters: - * 1. max_token_count must be a positive integer + * 2. runtime_max_chunk_limit runtime max chunk limit for the algorithm, which is non-negative */ @Override public List chunk(final String content, final Map runtimeParameters) { - // parse runtimeParameters before chunking int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); + int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, this.maxChunkLimit); List tokens = tokenize(content, tokenizer, maxTokenCount); List chunkResult = new ArrayList<>(); @@ -127,6 +130,18 @@ public List chunk(final String content, final Map runtim int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); while (startTokenIndex < tokens.size()) { + if (chunkResult.size() == runtimeMaxChunkLimit) { + // need processor level max chunk level to keep exception message consistent + throw new IllegalStateException( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.", + TYPE, + maxChunkLimit, + MAX_CHUNK_LIMIT_FIELD + ) + ); + } if (startTokenIndex == 0) { // include all characters till the start if no previous passage startContentPosition = 0; diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index f1e3024b6..24a90c105 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -43,7 +43,7 @@ import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD; -import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.MAX_CHUNK_LIMIT_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; public class TextChunkingProcessorTests extends OpenSearchTestCase { From 40991a32ea722e68b5a987e726e1dab429666587 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 23:22:16 +0800 Subject: [PATCH 178/189] tune comment Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/FixedTokenLengthChunker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index bf56f7f4f..2225436d2 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -115,7 +115,7 @@ public void parseParameters(Map parameters) { * @param content input string * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: * 1. max_token_count the max token limit for the tokenizer - * 2. runtime_max_chunk_limit runtime max chunk limit for the algorithm, which is non-negative + * 2. 
runtime_max_chunk_limit runtime max chunk limit for the algorithm */ @Override public List chunk(final String content, final Map runtimeParameters) { From ea4bbb8beb97ec3e85ae9126844aabf4a6bf4f9e Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 23:24:02 +0800 Subject: [PATCH 179/189] allow 0 for max chunk limit Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 65e5eb4f6..92e9bfe64 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -119,7 +119,7 @@ private void parseAlgorithmMap(final Map algorithmMap) { Map chunkerParameters = (Map) algorithmValue; // parse processor level max chunk limit this.maxChunkLimit = parseIntegerParameter(chunkerParameters, MAX_CHUNK_LIMIT_FIELD, DEFAULT_MAX_CHUNK_LIMIT); - if (maxChunkLimit <= 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) { + if (maxChunkLimit < 0 && maxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) { throw new IllegalArgumentException( String.format( Locale.ROOT, From f0dfb5745c728eca9a6922bff26c66c422aa7953 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 23:28:06 +0800 Subject: [PATCH 180/189] update runtime max chunk limit in text chunking processor Signed-off-by: yuye-aws --- .../neuralsearch/processor/TextChunkingProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 92e9bfe64..a355a70e8 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -287,7 +287,7 @@ private List chunkString(final String content, final Map // update runtime max_chunk_limit for each content List contentResult = chunker.chunk(content, runTimeParameters); int runtimeMaxChunkLimit = parseIntegerParameter(runTimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); - runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit); + runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit - contentResult.size()); return contentResult; } From cb4b39b129a7703df3dcc95e4a02c984a43a20e8 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 23:36:32 +0800 Subject: [PATCH 181/189] tune code for chunker Signed-off-by: yuye-aws --- .../opensearch/neuralsearch/processor/chunker/Chunker.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 385e1b724..fb6712c76 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -13,9 +13,9 @@ */ public interface Chunker { - static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; - static final int DEFAULT_MAX_CHUNK_LIMIT = 100; - static final int DISABLED_MAX_CHUNK_LIMIT = -1; + String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + int DEFAULT_MAX_CHUNK_LIMIT = 100; + int DISABLED_MAX_CHUNK_LIMIT = -1; /** * Parse the 
parameters for chunking algorithm.

From 98dd886c3d76d77e2c2dd5ef97aa22f31ef21cac Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 23:36:55 +0800
Subject: [PATCH 182/189] implement test for multiple field max chunk limit exceed

Signed-off-by: yuye-aws
---
 .../processor/TextChunkingProcessorTests.java | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index 24a90c105..b55136ba4 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -401,7 +401,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumDisabled_thenFail() {
+    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumDisabled_thenSucceed() {
         TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), -1);
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
         IngestDocument document = processor.execute(ingestDocument);
@@ -416,7 +416,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumD
     }
 
     @SneakyThrows
-    public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumExceed_thenFail() {
+    public void testExecute_withFixedTokenLength_andSourceDataStringExceedMaxChunkLimit_thenFail() {
         int maxChunkLimit = 1;
         TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), maxChunkLimit);
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
@@ -432,6 +432,23 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumE
         ));
     }
 
+    @SneakyThrows
+    public void testExecute_withFixedTokenLength_andSourceDataListWithMaxChunkLimitExceed_thenFail() {
+        int maxChunkLimit = 5;
+        TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), maxChunkLimit);
+        IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings());
+        IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument));
+        assert (illegalStateException.getMessage()
+            .contains(
+                String.format(
+                    Locale.ROOT,
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].",
+                    TYPE,
+                    maxChunkLimit
+                )
+            ));
+    }
+
     @SneakyThrows
     public void testCreate_withDefaultAlgorithm_andSourceDataString_thenSucceed() {
         TextChunkingProcessor processor = createDefaultAlgorithmInstance(createStringFieldMap());

From d245a042db77b5f342b04e4a821f7ca7d971375b Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Fri, 15 Mar 2024 23:39:57 +0800
Subject: [PATCH 183/189] tune method names in text chunking processor unit tests

Signed-off-by: yuye-aws
---
 .../processor/TextChunkingProcessorTests.java | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index b55136ba4..30a443c6c 100644
---
a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -103,10 +103,10 @@ private List> createSourceDataListNestedMap() { return List.of(documents, documents); } - private Map createFixedTokenLengthParametersWithMaxChunk(int maxChunkNum) { + private Map createFixedTokenLengthParametersWithMaxChunkLimit(int maxChunkLimit) { Map parameters = new HashMap<>(); parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); - parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkNum); + parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); return parameters; } @@ -150,10 +150,10 @@ private TextChunkingProcessor createFixedTokenLengthInstance(Map } @SneakyThrows - private TextChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkNum(Map fieldMap, int maxChunkNum) { + private TextChunkingProcessor createFixedTokenLengthInstanceWithMaxChunkLimit(Map fieldMap, int maxChunkLimit) { Map config = new HashMap<>(); Map algorithmMap = new HashMap<>(); - algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(maxChunkNum)); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunkLimit(maxChunkLimit)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); Map registry = new HashMap<>(); @@ -189,13 +189,13 @@ public void testCreate_whenAlgorithmFieldMissing_thenFail() { } @SneakyThrows - public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { + public void testCreate_whenMaxChunkLimitInvalidValue_thenFail() { Map registry = new HashMap<>(); Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(-2)); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunkLimit(-2)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); IllegalArgumentException illegalArgumentException = assertThrows( @@ -207,13 +207,13 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() { } @SneakyThrows - public void testCreate_whenMaxChunkNumDisabledValue_thenSucceed() { + public void testCreate_whenMaxChunkLimitDisabledValue_thenSucceed() { Map registry = new HashMap<>(); Map config = new HashMap<>(); Map fieldMap = new HashMap<>(); Map algorithmMap = new HashMap<>(); fieldMap.put(INPUT_FIELD, OUTPUT_FIELD); - algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunk(-1)); + algorithmMap.put(FixedTokenLengthChunker.ALGORITHM_NAME, createFixedTokenLengthParametersWithMaxChunkLimit(-1)); config.put(FIELD_MAP_FIELD, fieldMap); config.put(ALGORITHM_FIELD, algorithmMap); textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); @@ -369,8 +369,8 @@ private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_thenSucceed() { - TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkLimit_thenSucceed() { + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), 5); 
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -384,8 +384,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNum_ } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumTwice_thenSucceed() { - TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), 5); + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkLimitTwice_thenSucceed() { + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), 5); for (int i = 0; i < 2; i++) { IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); @@ -401,8 +401,8 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumT } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumDisabled_thenSucceed() { - TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), -1); + public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkLimitDisabled_thenSucceed() { + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), -1); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); @@ -418,7 +418,7 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumD @SneakyThrows public void testExecute_withFixedTokenLength_andSourceDataStringExceedMaxChunkLimit_thenFail() { int maxChunkLimit = 1; - TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), maxChunkLimit); + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument)); assert (illegalStateException.getMessage() @@ -433,9 +433,9 @@ public void testExecute_withFixedTokenLength_andSourceDataStringExceedMaxChunkLi } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataListWithMaxChunkLimitExceed_thenFail() { + public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimit_thenFail() { int maxChunkLimit = 5; - TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkNum(createStringFieldMap(), maxChunkLimit); + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument)); assert (illegalStateException.getMessage() From ad7ba259c69151b71c94b4bf194f3a73ca09321f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Fri, 15 Mar 2024 23:54:12 +0800 Subject: [PATCH 184/189] add unit tests for both algorithms with max chunk limit Signed-off-by: 
yuye-aws --- .../chunker/DelimiterChunkerTests.java | 46 ++++++++++ .../chunker/FixedTokenLengthChunkerTests.java | 86 +++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index c7184d7fc..e05f57452 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -11,6 +11,8 @@ import org.junit.Assert; import org.opensearch.test.OpenSearchTestCase; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; public class DelimiterChunkerTests extends OpenSearchTestCase { @@ -73,4 +75,48 @@ public void testChunk_withDoubleNewlineDelimiter_thenSucceed() { List chunkResult = chunker.chunk(content, Map.of()); assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); } + + public void testChunk_whenExceedMaxChunkLimit_thenFail() { + int maxChunkLimit = 2; + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit)); + String content = "\n\na\n\n\n"; + IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> chunker.chunk(content, Map.of())); + assert (illegalStateException.getMessage() + .contains( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", + TYPE, + maxChunkLimit + ) + )); + } + + public void testChunk_whenWithinMaxChunkLimit_thenSucceed() { + int maxChunkLimit = 3; + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit)); + String content = "\n\na\n\n\n"; + List chunkResult = chunker.chunk(content, Map.of()); + assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); + } + + public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() { + int maxChunkLimit = 3; + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit)); + String content = "\n\na\n\n\n"; + int runtimeMaxChunkLimit = 2; + IllegalStateException illegalStateException = assertThrows( + IllegalStateException.class, + () -> chunker.chunk(content, Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit)) + ); + assert (illegalStateException.getMessage() + .contains( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", + TYPE, + maxChunkLimit + ) + )); + } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index 2ad1ea18e..bd7fa8b87 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -23,6 +23,8 @@ import java.util.Map; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; +import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; +import static 
org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ALGORITHM_NAME; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; @@ -170,6 +172,18 @@ public void testChunk_whenTokenizationException_thenFail() { .contains(String.format(Locale.ROOT, "%s algorithm encounters exception in tokenization", ALGORITHM_NAME))); } + public void testChunk_withEmptyInput_thenSucceed() { + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(TOKENIZER_FIELD, "standard"); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + Map runtimeParameters = new HashMap<>(); + runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); + String content = ""; + List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); + assert (passages.isEmpty()); + } + public void testChunk_withTokenLimit10_thenSucceed() { Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); @@ -220,4 +234,76 @@ public void testChunk_withOverlapRateHalf_thenSucceed() { expectedPassages.add("sentences and 24 tokens by standard tokenizer in OpenSearch."); assertEquals(expectedPassages, passages); } + + public void testChunk_whenExceedMaxChunkLimit_thenFail() { + int maxChunkLimit = 2; + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(TOKENIZER_FIELD, "standard"); + parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + Map runtimeParameters = new HashMap<>(); + runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); + String content = + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + IllegalStateException illegalStateException = assertThrows( + IllegalStateException.class, + () -> fixedTokenLengthChunker.chunk(content, runtimeParameters) + ); + assert (illegalStateException.getMessage() + .contains( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", + TYPE, + maxChunkLimit + ) + )); + } + + public void testChunk_whenWithinMaxChunkLimit_thenSucceed() { + int maxChunkLimit = 3; + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(TOKENIZER_FIELD, "standard"); + parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + Map runtimeParameters = new HashMap<>(); + runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); + String content = + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked. 
The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); + expectedPassages.add("standard tokenizer in OpenSearch."); + assertEquals(expectedPassages, passages); + } + + public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() { + int maxChunkLimit = 3, runtimeMaxChunkLimit = 2; + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(TOKENIZER_FIELD, "standard"); + parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + Map runtimeParameters = new HashMap<>(); + runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); + runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit); + String content = + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + IllegalStateException illegalStateException = assertThrows( + IllegalStateException.class, + () -> fixedTokenLengthChunker.chunk(content, runtimeParameters) + ); + assert (illegalStateException.getMessage() + .contains( + String.format( + Locale.ROOT, + "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", + TYPE, + maxChunkLimit + ) + )); + } } From 9702168d8d8c455de00a21fadb945bf05565a3bf Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Sat, 16 Mar 2024 00:29:33 +0800 Subject: [PATCH 185/189] optimize code Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index a355a70e8..fdd1c211f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -153,15 +153,12 @@ private boolean isListOfString(final Object value) { private int getMaxTokenCount(final Map sourceAndMetadataMap) { String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString(); IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName); - int maxTokenCount; - if (Objects.nonNull(indexMetadata)) { - // if the index is specified in the metadata, read maxTokenCount from the index setting - IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); - maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); - } else { - maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(environment.settings()); + if (Objects.isNull(indexMetadata)) { + return IndexSettings.MAX_TOKEN_COUNT_SETTING.get(environment.settings()); } - return maxTokenCount; + // if the index is specified in the metadata, read maxTokenCount from the index setting + IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); + return indexService.getIndexSettings().getMaxTokenCount(); } /** From 3d8c030343eae7baaed6e646221f791893062408 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Sun, 17 Mar 2024 13:05:34 +0800 Subject: [PATCH 186/189] extract max chunk limit check to util class Signed-off-by: yuye-aws --- .../processor/chunker/ChunkerUtil.java | 40 +++++++++++++++++++ .../processor/chunker/DelimiterChunker.java | 25 ++---------- .../chunker/FixedTokenLengthChunker.java | 18 ++------- 3 files changed, 48 insertions(+), 35 
deletions(-)
 create mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java
new file mode 100644
index 000000000..1fcc34768
--- /dev/null
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.processor.chunker;
+
+import java.util.Locale;
+
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
+
+/**
+ * A util class used by chunking algorithms.
+ */
+public class ChunkerUtil {
+
+    private ChunkerUtil() {} // no instance of this util class
+
+    /**
+     * Checks whether the chunking results would exceed the max chunk limit.
+     * If it exceeds, throw IllegalStateException.
+     *
+     * @param chunkResultSize the size of chunking result
+     * @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize
+     * @param nonRuntimeMaxChunkLimit non-runtime max_chunk_limit, used to keep exception message consistent
+     */
+    public static void checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int nonRuntimeMaxChunkLimit) {
+        if (chunkResultSize == runtimeMaxChunkLimit) {
+            throw new IllegalStateException(
+                String.format(
+                    Locale.ROOT,
+                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
+                    TYPE,
+                    nonRuntimeMaxChunkLimit,
+                    MAX_CHUNK_LIMIT_FIELD
+                )
+            );
+        }
+    }
+}
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
index 3e9d415de..c688af436 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java
@@ -6,10 +6,8 @@
 import java.util.Map;
 import java.util.List;
-import java.util.Locale;
 import java.util.ArrayList;
 
-import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
 
@@ -37,7 +35,7 @@ public DelimiterChunker(final Map<String, Object> parameters) {
      *
      * @param parameters a map with non-runtime parameters as the following:
      * 1. delimiter A string as the paragraph split indicator
-     * 2. max_chunk_limit processor level max chunk level
+     * 2. max_chunk_limit processor level max chunk limit
      */
     @Override
     public void parseParameters(Map<String, Object> parameters) {
@@ -50,7 +48,7 @@ public void parseParameters(Map<String, Object> parameters) {
      *
      * @param content input string
     * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
-     * 1. max_chunk_level content level max chunk limit
+     * 1.
max_chunk_limit field level max chunk limit
      */
     @Override
     public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
@@ -61,7 +59,7 @@ public List<String> chunk(final String content, final Map<String, Object> runtim
         int nextDelimiterPosition = content.indexOf(delimiter);
 
         while (nextDelimiterPosition != -1) {
-            checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit);
+            ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit);
             end = nextDelimiterPosition + delimiter.length();
             chunkResult.add(content.substring(start, end));
             start = end;
@@ -69,25 +67,10 @@ public List<String> chunk(final String content, final Map<String, Object> runtim
         }
 
         if (start < content.length()) {
-            checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit);
+            ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit);
             chunkResult.add(content.substring(start));
         }
 
         return chunkResult;
     }
-
-    private void checkRunTimeMaxChunkLimit(int chunkResultLength, int runtimeMaxChunkLimit) {
-        if (chunkResultLength == runtimeMaxChunkLimit) {
-            // need processorMaxChunkLimit to keep exception message consistent
-            throw new IllegalStateException(
-                String.format(
-                    Locale.ROOT,
-                    "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.",
-                    TYPE,
-                    maxChunkLimit,
-                    MAX_CHUNK_LIMIT_FIELD
-                )
-            );
-        }
-    }
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
index 2225436d2..640fc2ab5 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
@@ -13,7 +13,6 @@
 import org.opensearch.index.analysis.AnalysisRegistry;
 import org.opensearch.action.admin.indices.analyze.AnalyzeAction;
 import org.opensearch.action.admin.indices.analyze.AnalyzeAction.AnalyzeToken;
-import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
 import static org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction.analyze;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseStringParameter;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseDoubleParameter;
@@ -73,10 +72,12 @@ public FixedTokenLengthChunker(final Map<String, Object> parameters) {
      * 1. tokenizer: the word tokenizer in opensearch
      * 2. token_limit: the token limit for each chunked passage
      * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage
+     * 4. max_chunk_limit processor level max chunk limit
+     * Here are requirements for non-runtime parameters:
      * 1. token_limit must be a positive integer
      * 2. overlap_rate must be within range [0, 0.5]
      * 3. tokenizer must be a word tokenizer
+     *
      */
     @Override
     public void parseParameters(Map<String, Object> parameters) {
@@ -115,7 +116,7 @@ public void parseParameters(Map<String, Object> parameters) {
      * @param content input string
      * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
      * 1. max_token_count the max token limit for the tokenizer
-     * 2. runtime_max_chunk_limit runtime max chunk limit for the algorithm
+     * 2.
max_chunk_limit field level max chunk limit */ @Override public List chunk(final String content, final Map runtimeParameters) { @@ -130,18 +131,7 @@ public List chunk(final String content, final Map runtim int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); while (startTokenIndex < tokens.size()) { - if (chunkResult.size() == runtimeMaxChunkLimit) { - // need processor level max chunk level to keep exception message consistent - throw new IllegalStateException( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.", - TYPE, - maxChunkLimit, - MAX_CHUNK_LIMIT_FIELD - ) - ); - } + ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit); if (startTokenIndex == 0) { // include all characters till the start if no previous passage startContentPosition = 0; From 9931fae25082877d54d66a9973e225c5d8efc37b Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 18 Mar 2024 09:36:44 +0800 Subject: [PATCH 187/189] resolve code review comments Signed-off-by: yuye-aws --- .../processor/TextChunkingProcessor.java | 12 ++++-------- .../neuralsearch/processor/chunker/ChunkerUtil.java | 2 +- .../processor/chunker/FixedTokenLengthChunker.java | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index fdd1c211f..d3263d3b1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -129,10 +129,8 @@ private void parseAlgorithmMap(final Map algorithmMap) { ) ); } - if (algorithmKey.equals(FixedTokenLengthChunker.ALGORITHM_NAME)) { - // fixed token length algorithm needs analysis registry for tokenization - chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); - } + // fixed token length algorithm needs analysis registry for tokenization + chunkerParameters.put(FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD, analysisRegistry); this.chunker = ChunkerFactory.create(algorithmKey, chunkerParameters); } @@ -171,10 +169,8 @@ public IngestDocument execute(final IngestDocument ingestDocument) { validateFieldsValue(sourceAndMetadataMap); // fixed token length algorithm needs runtime parameter max_token_count for tokenization Map runtimeParameters = new HashMap<>(); - if (chunker instanceof FixedTokenLengthChunker) { - int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); - runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); - } + int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); + runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, 0); return ingestDocument; diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java index 1fcc34768..3a1d2786e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java @@ -26,7 +26,7 @@ private ChunkerUtil() {} // no instance of this util class */ public static void 
checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int nonRuntimeMaxChunkLimit) { if (chunkResultSize == runtimeMaxChunkLimit) { - throw new IllegalStateException( + throw new IllegalArgumentException( String.format( Locale.ROOT, "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. This limit can be set by changing the [%s] parameter.", diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 640fc2ab5..01949731a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -162,7 +162,7 @@ private List tokenize(final String content, final String tokenizer return analyzeResponse.getTokens(); } catch (Exception e) { throw new IllegalStateException( - String.format(Locale.ROOT, "%s algorithm encounters exception in tokenization: %s", ALGORITHM_NAME, e.getMessage()), + String.format(Locale.ROOT, "analyzer %s encounters exception: %s", tokenizer, e.getMessage()), e ); } From fb6a961db5136fa7c338984e05d6f59a42ea8a9d Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Mon, 18 Mar 2024 09:49:33 +0800 Subject: [PATCH 188/189] fix unit tests Signed-off-by: yuye-aws --- .../processor/chunker/ChunkerUtil.java | 3 ++- .../processor/chunker/FixedTokenLengthChunker.java | 5 +---- .../processor/TextChunkingProcessorTests.java | 14 ++++++++++---- .../processor/chunker/DelimiterChunkerTests.java | 13 ++++++++----- .../chunker/FixedTokenLengthChunkerTests.java | 14 +++++++------- 5 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java index 3a1d2786e..d4406f33e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java @@ -7,6 +7,7 @@ import java.util.Locale; import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT; import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; /** @@ -25,7 +26,7 @@ private ChunkerUtil() {} // no instance of this util class * @param nonRuntimeMaxChunkLimit non-runtime max_chunk_limit, used to keep exception message consistent */ public static void checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int nonRuntimeMaxChunkLimit) { - if (chunkResultSize == runtimeMaxChunkLimit) { + if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize >= runtimeMaxChunkLimit) { throw new IllegalArgumentException( String.format( Locale.ROOT, diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 01949731a..cd630adf1 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -161,10 +161,7 @@ private List tokenize(final String content, final String tokenizer AnalyzeAction.Response analyzeResponse = analyze(analyzeRequest, 
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index 30a443c6c..908febe76 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -420,8 +420,11 @@ public void testExecute_withFixedTokenLength_andSourceDataStringExceedMaxChunkLi
         int maxChunkLimit = 1;
         TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit);
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
-        IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument));
-        assert (illegalStateException.getMessage()
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> processor.execute(ingestDocument)
+        );
+        assert (illegalArgumentException.getMessage()
             .contains(
                 String.format(
                     Locale.ROOT,
@@ -437,8 +440,11 @@ public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimi
         int maxChunkLimit = 5;
         TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit);
         IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings());
-        IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> processor.execute(ingestDocument));
-        assert (illegalStateException.getMessage()
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> processor.execute(ingestDocument)
+        );
+        assert (illegalArgumentException.getMessage()
             .contains(
                 String.format(
                     Locale.ROOT,
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
index e05f57452..54e296861 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java
@@ -80,8 +80,11 @@ public void testChunk_whenExceedMaxChunkLimit_thenFail() {
         int maxChunkLimit = 2;
         DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit));
         String content = "\n\na\n\n\n";
-        IllegalStateException illegalStateException = assertThrows(IllegalStateException.class, () -> chunker.chunk(content, Map.of()));
-        assert (illegalStateException.getMessage()
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
+            () -> chunker.chunk(content, Map.of())
+        );
+        assert (illegalArgumentException.getMessage()
             .contains(
                 String.format(
                     Locale.ROOT,
@@ -105,11 +108,11 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() {
         DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit));
         String content = "\n\na\n\n\n";
         int runtimeMaxChunkLimit = 2;
-        IllegalStateException illegalStateException = assertThrows(
-            IllegalStateException.class,
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
             () -> chunker.chunk(content, Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit))
         );
-        assert (illegalStateException.getMessage()
+        assert (illegalArgumentException.getMessage()
             .contains(
                 String.format(
                     Locale.ROOT,
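The test changes in this commit, above and below this point, all swap the expected exception to the new type. OpenSearch generally surfaces an IllegalArgumentException to REST clients as a 400 Bad Request while an IllegalStateException becomes a 500, so asserting the narrower type also pins down the user-facing behavior. Every updated test follows the same pattern, sketched below with plain JUnit; ExampleChunker is a hypothetical stand-in for DelimiterChunker and FixedTokenLengthChunker, which in the real suite run under the OpenSearch test framework.

```java
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

import org.junit.Test;

public class MaxChunkLimitAssertionSketch {

    // hypothetical stand-in for DelimiterChunker / FixedTokenLengthChunker
    static class ExampleChunker {
        void chunk(String content) {
            throw new IllegalArgumentException("chunker has exceeded the allowed maximum of [2]");
        }
    }

    @Test
    public void testChunk_whenExceedMaxChunkLimit_thenFail() {
        ExampleChunker chunker = new ExampleChunker();
        // capture the specific exception type, then inspect its message
        IllegalArgumentException e = assertThrows(IllegalArgumentException.class, () -> chunker.chunk("\n\na\n\n\n"));
        assertTrue(e.getMessage().contains("exceeded the allowed maximum"));
    }
}
```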
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
index bd7fa8b87..bbcaa7069 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java
@@ -169,7 +169,7 @@ public void testChunk_whenTokenizationException_thenFail() {
             () -> fixedTokenLengthChunker.chunk(content, parameters)
         );
         assert (illegalStateException.getMessage()
-            .contains(String.format(Locale.ROOT, "%s algorithm encounters exception in tokenization", ALGORITHM_NAME)));
+            .contains(String.format(Locale.ROOT, "analyzer %s throws exception", lowercaseTokenizer)));
     }
 
     public void testChunk_withEmptyInput_thenSucceed() {
@@ -246,11 +246,11 @@ public void testChunk_whenExceedMaxChunkLimit_thenFail() {
         runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000);
         String content =
             "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
-        IllegalStateException illegalStateException = assertThrows(
-            IllegalStateException.class,
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
             () -> fixedTokenLengthChunker.chunk(content, runtimeParameters)
         );
-        assert (illegalStateException.getMessage()
+        assert (illegalArgumentException.getMessage()
             .contains(
                 String.format(
                     Locale.ROOT,
@@ -292,11 +292,11 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() {
         runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit);
         String content =
             "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
-        IllegalStateException illegalStateException = assertThrows(
-            IllegalStateException.class,
+        IllegalArgumentException illegalArgumentException = assertThrows(
+            IllegalArgumentException.class,
             () -> fixedTokenLengthChunker.chunk(content, runtimeParameters)
         );
-        assert (illegalStateException.getMessage()
+        assert (illegalArgumentException.getMessage()
             .contains(
                 String.format(
                     Locale.ROOT,
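One gap remains after these test fixes: chunkString in TextChunkingProcessor still decrements the runtime max_chunk_limit unconditionally, so a disabled limit of -1 drifts further negative with every chunked field, stops matching the sentinel, and the strengthened guard then rejects the next field of the same document. The final commit below closes that hole. A sketch of the corrected update step, under the same assumed sentinel of -1 and with a simplified Chunker interface and parameter key standing in for the plugin's own:

```java
import java.util.List;
import java.util.Map;

final class ChunkStringSketch {

    static final int DISABLED_MAX_CHUNK_LIMIT = -1; // assumed sentinel, as above

    interface Chunker {
        List<String> chunk(String content, Map<String, Object> runtimeParameters);
    }

    // Decrement the shared chunk budget only when a real limit is set; without the
    // sentinel check, -1 would become -1 - contentResult.size() and the guard would
    // reject the next field even though the limit is meant to be disabled.
    static List<String> chunkString(Chunker chunker, String content, Map<String, Object> runTimeParameters) {
        List<String> contentResult = chunker.chunk(content, runTimeParameters);
        int runtimeMaxChunkLimit = (int) runTimeParameters.getOrDefault("max_chunk_limit", DISABLED_MAX_CHUNK_LIMIT);
        if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
            runTimeParameters.put("max_chunk_limit", runtimeMaxChunkLimit - contentResult.size());
        }
        return contentResult;
    }
}
```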
From 68fef4f0cd614dc380d5afac3c18f968241e7965 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Mon, 18 Mar 2024 09:59:11 +0800
Subject: [PATCH 189/189] bug fix: only update runtime max chunk limit when enabled

Signed-off-by: yuye-aws
---
 .../neuralsearch/processor/TextChunkingProcessor.java | 6 ++++--
 .../processor/TextChunkingProcessorTests.java         | 8 ++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index d3263d3b1..50a9d4b7b 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -277,10 +277,12 @@ private int chunkMapType(
      * Chunk the content, update the runtime max_chunk_limit and return the result
      */
     private List<String> chunkString(final String content, final Map<String, Object> runTimeParameters) {
-        // update runtime max_chunk_limit for each content
+        // update runtime max_chunk_limit if not disabled
         List<String> contentResult = chunker.chunk(content, runTimeParameters);
         int runtimeMaxChunkLimit = parseIntegerParameter(runTimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
-        runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit - contentResult.size());
+        if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
+            runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit - contentResult.size());
+        }
         return contentResult;
     }
 
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
index 908febe76..934918e18 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -455,6 +455,14 @@ public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimi
         ));
     }
 
+    @SneakyThrows
+    public void testExecute_withFixedTokenLength_andSourceDataListDisabledMaxChunkLimit_thenSucceed() {
+        int maxChunkLimit = -1;
+        TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit);
+        IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings());
+        processor.execute(ingestDocument);
+    }
+
     @SneakyThrows
     public void testCreate_withDefaultAlgorithm_andSourceDataString_thenSucceed() {
         TextChunkingProcessor processor = createDefaultAlgorithmInstance(createStringFieldMap());