From 1d0cbbc6d6e8c50299bb38b3bfa5e241488ff6f4 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 18 Jun 2024 09:13:15 +0200 Subject: [PATCH] fix: add new embedding models --- requirements-dev.lock | 67 +++++++++++++------------- scrapegraphai/helpers/models_tokens.py | 5 +- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/requirements-dev.lock b/requirements-dev.lock index 52c5faa4..2e8ca0cb 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -21,9 +21,9 @@ altair==5.3.0 # via streamlit annotated-types==0.7.0 # via pydantic -anthropic==0.26.1 +anthropic==0.28.1 # via langchain-anthropic -anyio==4.3.0 +anyio==4.4.0 # via anthropic # via groq # via httpx @@ -42,9 +42,9 @@ beautifulsoup4==4.12.3 # via scrapegraphai blinker==1.8.2 # via streamlit -boto3==1.34.113 +boto3==1.34.127 # via langchain-aws -botocore==1.34.113 +botocore==1.34.127 # via boto3 # via s3transfer burr==0.22.1 @@ -52,7 +52,7 @@ burr==0.22.1 cachetools==5.3.3 # via google-auth # via streamlit -certifi==2024.2.2 +certifi==2024.6.2 # via httpcore # via httpx # via requests @@ -67,7 +67,7 @@ contourpy==1.2.1 # via matplotlib cycler==0.12.1 # via matplotlib -dataclasses-json==0.6.6 +dataclasses-json==0.6.7 # via langchain # via langchain-community defusedxml==0.7.1 @@ -80,27 +80,26 @@ dnspython==2.6.1 # via email-validator docutils==0.19 # via sphinx -email-validator==2.1.1 +email-validator==2.1.2 # via fastapi faiss-cpu==1.8.0 # via scrapegraphai fastapi==0.111.0 # via burr - # via fastapi-pagination fastapi-cli==0.0.4 # via fastapi -fastapi-pagination==0.12.24 +fastapi-pagination==0.12.25 # via burr -filelock==3.14.0 +filelock==3.15.1 # via huggingface-hub -fonttools==4.52.1 +fonttools==4.53.0 # via matplotlib free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 # via aiohttp # via aiosignal -fsspec==2024.5.0 +fsspec==2024.6.0 # via huggingface-hub furo==2024.5.6 # via scrapegraphai @@ -116,9 +115,9 @@ google-api-core==2.19.0 # via google-ai-generativelanguage # via google-api-python-client # via google-generativeai -google-api-python-client==2.130.0 +google-api-python-client==2.133.0 # via google-generativeai -google-auth==2.29.0 +google-auth==2.30.0 # via google-ai-generativelanguage # via google-api-core # via google-api-python-client @@ -128,7 +127,7 @@ google-auth-httplib2==0.2.0 # via google-api-python-client google-generativeai==0.5.4 # via langchain-google-genai -googleapis-common-protos==1.63.0 +googleapis-common-protos==1.63.1 # via google-api-core # via grpcio-status graphviz==0.20.3 @@ -136,9 +135,9 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright -groq==0.8.0 +groq==0.9.0 # via langchain-groq -grpcio==1.64.0 +grpcio==1.64.1 # via google-api-core # via grpcio-status grpcio-status==1.62.2 @@ -160,7 +159,7 @@ httpx==0.27.0 # via fastapi # via groq # via openai -huggingface-hub==0.23.1 +huggingface-hub==0.23.4 # via tokenizers idna==3.7 # via anyio @@ -178,7 +177,7 @@ jinja2==3.1.4 # via fastapi # via pydeck # via sphinx -jiter==0.4.0 +jiter==0.4.2 # via anthropic jmespath==1.0.1 # via boto3 @@ -186,7 +185,7 @@ jmespath==1.0.1 jsonpatch==1.33 # via langchain # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch jsonschema==4.22.0 # via altair @@ -219,7 +218,7 @@ langchain-openai==0.1.6 # via scrapegraphai langchain-text-splitters==0.0.2 # via langchain -langsmith==0.1.63 +langsmith==0.1.77 # via langchain # via langchain-community # via langchain-core @@ -231,7 +230,7 @@ markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 # via jinja2 -marshmallow==3.21.2 +marshmallow==3.21.3 # via dataclasses-json matplotlib==3.9.0 # via burr @@ -257,10 +256,10 @@ numpy==1.26.4 # via pydeck # via sf-hamilton # via streamlit -openai==1.30.3 +openai==1.34.0 # via burr # via langchain-openai -orjson==3.10.3 +orjson==3.10.5 # via fastapi # via langsmith packaging==23.2 @@ -303,7 +302,7 @@ pyasn1==0.6.0 # via rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.1 +pydantic==2.7.4 # via anthropic # via burr # via fastapi @@ -314,7 +313,7 @@ pydantic==2.7.1 # via langchain-core # via langsmith # via openai -pydantic-core==2.18.2 +pydantic-core==2.18.4 # via pydantic pydeck==0.9.1 # via streamlit @@ -352,7 +351,7 @@ referencing==0.35.1 # via jsonschema-specifications regex==2024.5.15 # via tiktoken -requests==2.32.2 +requests==2.32.3 # via burr # via free-proxy # via google-api-core @@ -375,7 +374,7 @@ s3transfer==0.10.1 # via boto3 semchunk==1.0.1 # via scrapegraphai -sf-hamilton==1.63.0 +sf-hamilton==1.66.1 # via burr shellingham==1.5.4 # via typer @@ -418,7 +417,7 @@ starlette==0.37.2 # via fastapi streamlit==1.35.0 # via burr -tenacity==8.3.0 +tenacity==8.4.1 # via langchain # via langchain-community # via langchain-core @@ -432,7 +431,7 @@ toml==0.10.2 # via streamlit toolz==0.12.1 # via altair -tornado==6.4 +tornado==6.4.1 # via streamlit tqdm==4.66.4 # via google-generativeai @@ -442,7 +441,7 @@ tqdm==4.66.4 # via semchunk typer==0.12.3 # via fastapi-cli -typing-extensions==4.12.0 +typing-extensions==4.12.2 # via anthropic # via fastapi # via fastapi-pagination @@ -469,15 +468,15 @@ undetected-playwright==0.3.0 # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.18 +urllib3==2.2.2 # via botocore # via requests -uvicorn==0.29.0 +uvicorn==0.30.1 # via burr # via fastapi uvloop==0.19.0 # via uvicorn -watchfiles==0.21.0 +watchfiles==0.22.0 # via uvicorn websockets==12.0 # via uvicorn diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index c9d61a98..424f95c6 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -60,7 +60,10 @@ "stablelm-zephyr": 8192, "wizardlm2:8x22b": 65536, # embedding models - "shaw/dmeta-embedding-zh": 8192, + "shaw/dmeta-embedding-zh-small-q4": 8192, + "shaw/dmeta-embedding-zh-q4": 8192, + "chevalblanc/acge_text_embedding": 8192, + "martcreation/dmeta-embedding-zh": 8192, "snowflake-arctic-embed": 8192, "mxbai-embed-large": 512 },