From 48bd937f5e02b20c45d7fa31d1be48a2aad3d7ca Mon Sep 17 00:00:00 2001 From: Leszek M <63610219+LexiestLeszek@users.noreply.github.com> Date: Mon, 3 Jul 2023 21:01:28 +0200 Subject: [PATCH 1/2] Added RecursiveCharacterSplitter Found the description on LangChain website and created my implementation, seems to be working. It will split documents recursively by different characters - starting with "\n\n", then "\n", then " ". This is nice because it will try to keep all the semantically relevant content in the same place for as long as possible. Orig from langchain: https://js.langchain.com/docs/modules/indexes/text_splitters/examples/recursive_character --- .../RecursiveCharacterSplitter.swift | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 Sources/SimilaritySearchKit/Core/Embeddings/Splitters/RecursiveCharacterSplitter.swift diff --git a/Sources/SimilaritySearchKit/Core/Embeddings/Splitters/RecursiveCharacterSplitter.swift b/Sources/SimilaritySearchKit/Core/Embeddings/Splitters/RecursiveCharacterSplitter.swift new file mode 100644 index 0000000..2a83c9c --- /dev/null +++ b/Sources/SimilaritySearchKit/Core/Embeddings/Splitters/RecursiveCharacterSplitter.swift @@ -0,0 +1,79 @@ +// +// RecursiveCharacterSplitter.swift +// +// Created by Leszek Mielnikow on 03/07/2023. +// + +import Foundation +import SimilaritySearchKit + +public class RecursiveCharacterSplitter: TextSplitterProtocol { + let characterSplitter: CharacterSplitter + + public init() { + characterSplitter = CharacterSplitter() + } + + public func split(text: String, chunkSize: Int = 100, overlapSize: Int = 0) -> ([String], [[String]]?) { + let separators = ["\n\n", "\n", ".", " "] + + for separator in separators { + let splits = text.components(separatedBy: separator) + let (isValid, splitTokens) = isSplitValid(chunks: splits, maxChunkSize: chunkSize) + + if isValid { + var chunks: [String] = [] + var chunkTokens: [[String]] = [] + + var currentChunkTokens: [String] = [] + var currentChunkSize: Int = 0 + var currentChunkSplit: String = "" + + for (idx, tokens) in splitTokens.enumerated() { + let tokensSize = tokens.count + + if currentChunkSize + tokensSize < chunkSize { + currentChunkTokens.append(contentsOf: tokens) + currentChunkSize += tokensSize + currentChunkSplit += splits[idx] + separator + + } else { + chunks.append(currentChunkSplit.trimmingCharacters(in: .whitespaces)) + chunkTokens.append(characterSplitter.split(text: currentChunkSplit, chunkSize: chunkSize).0) + + // reset current + currentChunkTokens = tokens + currentChunkSize = tokensSize + currentChunkSplit = splits[idx] + separator + } + } + + // Add the last chunk if it's not empty + if !currentChunkSplit.isEmpty { + chunks.append(currentChunkSplit.trimmingCharacters(in: .whitespaces)) + chunkTokens.append(characterSplitter.split(text: currentChunkSplit, chunkSize: chunkSize).0) + } + + return (chunks, chunkTokens) + } + } + + return ([], []) + } + + // MARK: - Helpers + + private func isSplitValid(chunks: [String], maxChunkSize: Int) -> (Bool, [[String]]) { + var splitTokens: [[String]] = [] + + for chunk in chunks { + let tokens = characterSplitter.split(text: chunk, chunkSize: maxChunkSize).0 + if chunk.count > maxChunkSize { + return (false, []) + } + splitTokens.append(tokens) + } + + return (true, splitTokens) + } +} From 676dfefeca419fa358fba234b9a89b5f2064164c Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Wed, 5 Jul 2023 07:54:20 -0700 Subject: [PATCH 2/2] Reduce search performance test for CI --- Tests/SimilaritySearchKitTests/BenchmarkTests.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/SimilaritySearchKitTests/BenchmarkTests.swift b/Tests/SimilaritySearchKitTests/BenchmarkTests.swift index ca9ca3b..c3aea86 100644 --- a/Tests/SimilaritySearchKitTests/BenchmarkTests.swift +++ b/Tests/SimilaritySearchKitTests/BenchmarkTests.swift @@ -115,7 +115,7 @@ class BenchmarkTests: XCTestCase { } func testDistilbertPerformanceSearch() { - let testAmount = 10 + let testAmount = 2 let passageIds = Array(0..