regist_chromadb.py
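# Rebuilds a persistent ChromaDB collection from data/DATASET_MASTER.xlsx:
# every 1,000-value window of the VALUE column is upserted as one embedding,
# in batches of 100, into "my_collection" (cosine HNSW) stored under ./DB.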
# Swap in pysqlite3 for the stdlib sqlite3 so ChromaDB gets a recent SQLite build.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import itertools
import os

import dotenv
import pandas as pd
import chromadb
from tqdm import tqdm

# Load environment variables from a local .env file, if present.
dotenv.load_dotenv()
# Persist the vector store under the local "DB" directory.
chroma_client = chromadb.PersistentClient(path="DB")

# Drop any existing collection so each run starts from a clean slate.
try:
    chroma_client.delete_collection("my_collection")
    print("Collection deleted")
except Exception as e:
    print(e)

collection = chroma_client.create_collection(
    name="my_collection",
    metadata={"hnsw:space": "cosine"},  # cosine distance for the HNSW index
)
print("Collection created")
df = pd.read_excel('data/DATASET_MASTER.xlsx')
print("Data loaded")
print("length: ", len(df['VALUE']))

# Each item pairs a string id with a 1,000-value window of the VALUE column;
# these windows are upserted below as the embedding vectors.
data_generator = map(lambda i: {
    'id': str(i),
    'values': df['VALUE'][i:i + 1000].tolist(),
}, range(len(df['VALUE']) - 1000))  # len(df['VALUE'])
def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))
# Upsert ids and embeddings into the collection in batches of 100.
for vectors_chunk in tqdm(chunks(data_generator, batch_size=100), desc='Upserting vectors'):
    collection.upsert(
        embeddings=[v['values'] for v in vectors_chunk],
        ids=[v['id'] for v in vectors_chunk],
    )
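
# Optional sanity check (a sketch, not part of the original script): query the
# collection back with the first registered window to confirm the upserts landed.
# This assumes the DataFrame holds enough rows that at least one 1,000-value
# window was actually upserted above.
sample_vector = df['VALUE'][0:1000].tolist()
results = collection.query(query_embeddings=[sample_vector], n_results=3)
print("Nearest ids to the first window:", results['ids'][0])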