diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index b0fb983..b6c2b2d 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -74,7 +74,7 @@ jobs:
- uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
- python-version: '3.7'
+ python-version: '3.10'
- name: Set preferred device to CPU
shell: bash -l {0}
@@ -169,7 +169,7 @@ jobs:
- uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
- python-version: '3.7'
+ python-version: '3.10'
- name: Set preferred device to GPU
shell: bash -l {0}
diff --git a/README.md b/README.md
index 624f35a..417738c 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@
**Features:**
- [x] Image annotation for polygon, rectangle, circle, line and point.
-- [x] Auto-labeling with YOLOv5 and Segment Anything.
+- [x] Auto-labeling with YOLOv8 and Segment Anything (SAM, SAM2).
- [x] Text detection, recognition and KIE (Key Information Extraction) labeling.
- [x] Multiple languages availables: English, Vietnamese, Chinese.
@@ -49,7 +49,7 @@
### 2. Install from Pypi
-- Requirements: Python >= 3.8, <= 3.12. Recommended: Python 3.12.
+- Requirements: Python >= 3.10, <= 3.12. Recommended: Python 3.12.
- Recommended: [Miniconda/Anaconda](https://docs.conda.io/en/latest/miniconda.html).
- Create environment:
@@ -126,4 +126,4 @@ If you want to contribute to **AnyLabeling**, please read [Contribution Guidelin
- Labeling UI built with ideas and components from [LabelImg](https://github.com/heartexlabs/labelImg), [LabelMe](https://github.com/wkentaro/labelme).
- Auto-labeling with [Segment Anything Models](https://segment-anything.com/), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM).
-- Auto-labeling with [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv8](https://github.com/ultralytics/ultralytics).
+- Auto-labeling with [YOLOv8](https://github.com/ultralytics/ultralytics).
diff --git a/anylabeling/app_info.py b/anylabeling/app_info.py
index ec4ce92..f236fd4 100644
--- a/anylabeling/app_info.py
+++ b/anylabeling/app_info.py
@@ -1,4 +1,4 @@
__appname__ = "AnyLabeling"
__appdescription__ = "Effortless data labeling with AI support"
-__version__ = "0.3.3"
+__version__ = "0.4.0"
__preferred_device__ = "CPU" # GPU or CPU
diff --git a/anylabeling/configs/auto_labeling/models.yaml b/anylabeling/configs/auto_labeling/models.yaml
index 523265b..2c05408 100644
--- a/anylabeling/configs/auto_labeling/models.yaml
+++ b/anylabeling/configs/auto_labeling/models.yaml
@@ -1,3 +1,16 @@
+- name: "sam2_hiera_tiny_20240803"
+ display_name: Segment Anything 2 (Hiera-Tiny)
+ download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_tiny.zip
+- name: "sam2_hiera_small_20240803"
+ display_name: Segment Anything 2 (Hiera-Small)
+ download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_small.zip
+- name: "sam2_hiera_base_plus_20240803"
+ display_name: Segment Anything 2 (Hiera-Base+)
+ download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_base_plus.zip
+- name: "sam2_hiera_large_20240803"
+ display_name: Segment Anything 2 (Hiera-Large)
+ download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_large.zip
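+# The SAM2 entries above are ONNX exports loaded via services/auto_labeling/sam2_onnx.py.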
- name: "mobile_sam_20230629"
display_name: Segment Anything (MobileSAM)
download_url: https://huggingface.co/vietanhdev/segment-anything-onnx-models/resolve/main/mobile_sam_20230629.zip
@@ -19,22 +32,7 @@
- name: "sam_vit_h_4b8939_quant"
display_name: Segment Anything (ViT-H Quant)
download_url: https://huggingface.co/vietanhdev/segment-anything-onnx-models/resolve/main/sam_vit_h_4b8939_quant.zip
-- name: "yolov5n-r20230415"
- display_name: YOLOv5n
- download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5n-r20230415.zip
-- name: "yolov5s-r20230415"
- display_name: YOLOv5s
- download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5s-r20230415.zip
-- name: "yolov5m-r20230415"
- display_name: YOLOv5m
- download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5m-r20230415.zip
-- name: "yolov5l-r20230415"
- display_name: YOLOv5l
- download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5l-r20230415.zip
-- name: "yolov5x-r20230415"
- display_name: YOLOv5x
- download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov5x-r20230415.zip
-- name: "yolov8n-r20230415"
display_name: YOLOv8n
download_url: https://github.com/vietanhdev/anylabeling-assets/releases/download/v0.4.0/yolov8n-r20230415.zip
- name: "yolov8s-r20230415"
diff --git a/anylabeling/services/auto_labeling/sam2_onnx.py b/anylabeling/services/auto_labeling/sam2_onnx.py
new file mode 100644
index 0000000..1ad5a1a
--- /dev/null
+++ b/anylabeling/services/auto_labeling/sam2_onnx.py
@@ -0,0 +1,349 @@
+# Code from: https://github.com/vietanhdev/samexporter/blob/main/samexporter/sam2_onnx.py
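+#
+# Example usage (paths and prompt values are illustrative):
+#   model = SegmentAnything2ONNX("sam2_encoder.onnx", "sam2_decoder.onnx")
+#   embedding = model.encode(cv2.imread("image.jpg"))
+#   masks = model.predict_masks(
+#       embedding, [{"type": "point", "data": [x, y], "label": 1}])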
+import time
+from typing import Any
+
+import cv2
+import numpy as np
+import onnxruntime
+
+
+class SegmentAnything2ONNX:
+ """Segmentation model using Segment Anything 2 (SAM2)"""
+
+ def __init__(self, encoder_model_path, decoder_model_path) -> None:
+ self.encoder = SAM2ImageEncoder(encoder_model_path)
+ self.decoder = SAM2ImageDecoder(
+ decoder_model_path, self.encoder.input_shape[2:]
+ )
+
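+    # encode() runs the image encoder once per image; the returned features
+    # can be cached and reused across many prompts on the same image.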
+    def encode(self, cv_image: np.ndarray) -> dict[str, Any]:
+ original_size = cv_image.shape[:2]
+ high_res_feats_0, high_res_feats_1, image_embed = self.encoder(cv_image)
+ return {
+ "high_res_feats_0": high_res_feats_0,
+ "high_res_feats_1": high_res_feats_1,
+ "image_embedding": image_embed,
+ "original_size": original_size,
+ }
+
+    def predict_masks(self, embedding, prompt) -> np.ndarray:
+ points = []
+ labels = []
+ for mark in prompt:
+ if mark["type"] == "point":
+ points.append(mark["data"])
+ labels.append(mark["label"])
+ elif mark["type"] == "rectangle":
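+                # SAM-style box prompt: the two box corners are passed as
+                # extra points labeled 2 (top-left) and 3 (bottom-right).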
+ points.append([mark["data"][0], mark["data"][1]]) # top left
+ points.append(
+ [mark["data"][2], mark["data"][3]]
+ ) # bottom right
+ labels.append(2)
+ labels.append(3)
+ points, labels = np.array(points), np.array(labels)
+
+ image_embedding = embedding["image_embedding"]
+ high_res_feats_0 = embedding["high_res_feats_0"]
+ high_res_feats_1 = embedding["high_res_feats_1"]
+ original_size = embedding["original_size"]
+ self.decoder.set_image_size(original_size)
+ masks, _ = self.decoder(
+ image_embedding,
+ high_res_feats_0,
+ high_res_feats_1,
+ points,
+ labels,
+ )
+
+ return masks
+
+ def transform_masks(self, masks, original_size, transform_matrix):
+ """Transform the masks back to the original image size."""
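+        # Apply the given affine transform to every mask so that it lines
+        # up with the original (untransformed) image.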
+ output_masks = []
+ for batch in range(masks.shape[0]):
+ batch_masks = []
+ for mask_id in range(masks.shape[1]):
+ mask = masks[batch, mask_id]
+ mask = cv2.warpAffine(
+ mask,
+ transform_matrix[:2],
+ (original_size[1], original_size[0]),
+ flags=cv2.INTER_LINEAR,
+ )
+ batch_masks.append(mask)
+ output_masks.append(batch_masks)
+ return np.array(output_masks)
+
+
+class SAM2ImageEncoder:
+ def __init__(self, path: str) -> None:
+ # Initialize model
+ self.session = onnxruntime.InferenceSession(
+ path, providers=onnxruntime.get_available_providers()
+ )
+
+ # Get model info
+ self.get_input_details()
+ self.get_output_details()
+
+ def __call__(
+ self, image: np.ndarray
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+ return self.encode_image(image)
+
+ def encode_image(
+ self, image: np.ndarray
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+ input_tensor = self.prepare_input(image)
+
+ outputs = self.infer(input_tensor)
+
+ return self.process_output(outputs)
+
+ def prepare_input(self, image: np.ndarray) -> np.ndarray:
+ self.img_height, self.img_width = image.shape[:2]
+
+ input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ input_img = cv2.resize(input_img, (self.input_width, self.input_height))
+
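+        # Normalize with the standard ImageNet mean/std.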
+ mean = np.array([0.485, 0.456, 0.406])
+ std = np.array([0.229, 0.224, 0.225])
+ input_img = (input_img / 255.0 - mean) / std
+ input_img = input_img.transpose(2, 0, 1)
+ input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+
+ return input_tensor
+
+ def infer(self, input_tensor: np.ndarray) -> list[np.ndarray]:
+ start = time.perf_counter()
+ outputs = self.session.run(
+ self.output_names, {self.input_names[0]: input_tensor}
+ )
+
+ print(f"infer time: {(time.perf_counter() - start) * 1000:.2f} ms")
+ return outputs
+
+ def process_output(
+ self, outputs: list[np.ndarray]
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+ return outputs[0], outputs[1], outputs[2]
+
+ def get_input_details(self) -> None:
+ model_inputs = self.session.get_inputs()
+ self.input_names = [
+ model_inputs[i].name for i in range(len(model_inputs))
+ ]
+
+ self.input_shape = model_inputs[0].shape
+ self.input_height = self.input_shape[2]
+ self.input_width = self.input_shape[3]
+
+ def get_output_details(self) -> None:
+ model_outputs = self.session.get_outputs()
+ self.output_names = [
+ model_outputs[i].name for i in range(len(model_outputs))
+ ]
+
+
+class SAM2ImageDecoder:
+ def __init__(
+ self,
+ path: str,
+ encoder_input_size: tuple[int, int],
+        orig_im_size: tuple[int, int] | None = None,
+ mask_threshold: float = 0.0,
+ ) -> None:
+ # Initialize model
+ self.session = onnxruntime.InferenceSession(
+ path, providers=onnxruntime.get_available_providers()
+ )
+
+ self.orig_im_size = (
+ orig_im_size if orig_im_size is not None else encoder_input_size
+ )
+ self.encoder_input_size = encoder_input_size
+ self.mask_threshold = mask_threshold
+ self.scale_factor = 4
+
+ # Get model info
+ self.get_input_details()
+ self.get_output_details()
+
+ def __call__(
+ self,
+ image_embed: np.ndarray,
+ high_res_feats_0: np.ndarray,
+ high_res_feats_1: np.ndarray,
+ point_coords: list[np.ndarray] | np.ndarray,
+ point_labels: list[np.ndarray] | np.ndarray,
+    ) -> tuple[np.ndarray, np.ndarray]:
+
+ return self.predict(
+ image_embed,
+ high_res_feats_0,
+ high_res_feats_1,
+ point_coords,
+ point_labels,
+ )
+
+ def predict(
+ self,
+ image_embed: np.ndarray,
+ high_res_feats_0: np.ndarray,
+ high_res_feats_1: np.ndarray,
+ point_coords: list[np.ndarray] | np.ndarray,
+ point_labels: list[np.ndarray] | np.ndarray,
+    ) -> tuple[np.ndarray, np.ndarray]:
+
+ inputs = self.prepare_inputs(
+ image_embed,
+ high_res_feats_0,
+ high_res_feats_1,
+ point_coords,
+ point_labels,
+ )
+
+ outputs = self.infer(inputs)
+
+ return self.process_output(outputs)
+
+ def prepare_inputs(
+ self,
+ image_embed: np.ndarray,
+ high_res_feats_0: np.ndarray,
+ high_res_feats_1: np.ndarray,
+ point_coords: list[np.ndarray] | np.ndarray,
+ point_labels: list[np.ndarray] | np.ndarray,
+ ):
+
+ input_point_coords, input_point_labels = self.prepare_points(
+ point_coords, point_labels
+ )
+
+ num_labels = input_point_labels.shape[0]
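+        # No prior mask is provided: feed an all-zero low-resolution mask
+        # (1/4 of the encoder resolution) and set has_mask_input to 0.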
+ mask_input = np.zeros(
+ (
+ num_labels,
+ 1,
+ self.encoder_input_size[0] // self.scale_factor,
+ self.encoder_input_size[1] // self.scale_factor,
+ ),
+ dtype=np.float32,
+ )
+ has_mask_input = np.array([0], dtype=np.float32)
+
+ return (
+ image_embed,
+ high_res_feats_0,
+ high_res_feats_1,
+ input_point_coords,
+ input_point_labels,
+ mask_input,
+ has_mask_input,
+ )
+
+ def prepare_points(
+ self,
+ point_coords: list[np.ndarray] | np.ndarray,
+ point_labels: list[np.ndarray] | np.ndarray,
+ ) -> tuple[np.ndarray, np.ndarray]:
+
+ if isinstance(point_coords, np.ndarray):
+ input_point_coords = point_coords[np.newaxis, ...]
+ input_point_labels = point_labels[np.newaxis, ...]
+ else:
+ max_num_points = max([coords.shape[0] for coords in point_coords])
+ # We need to make sure that all inputs have the same number of points
+ # Add invalid points to pad the input (0, 0) with -1 value for labels
+ input_point_coords = np.zeros(
+ (len(point_coords), max_num_points, 2), dtype=np.float32
+ )
+ input_point_labels = (
+ np.ones((len(point_coords), max_num_points), dtype=np.float32)
+ * -1
+ )
+
+ for i, (coords, labels) in enumerate(
+ zip(point_coords, point_labels)
+ ):
+ input_point_coords[i, : coords.shape[0], :] = coords
+ input_point_labels[i, : labels.shape[0]] = labels
+
+ input_point_coords[..., 0] = (
+ input_point_coords[..., 0]
+ / self.orig_im_size[1]
+ * self.encoder_input_size[1]
+ ) # Normalize x
+ input_point_coords[..., 1] = (
+ input_point_coords[..., 1]
+ / self.orig_im_size[0]
+ * self.encoder_input_size[0]
+ ) # Normalize y
+
+ return input_point_coords.astype(np.float32), input_point_labels.astype(
+ np.float32
+ )
+
+ def infer(self, inputs) -> list[np.ndarray]:
+ start = time.perf_counter()
+
+ outputs = self.session.run(
+ self.output_names,
+ {
+ self.input_names[i]: inputs[i]
+ for i in range(len(self.input_names))
+ },
+ )
+
+ print(f"infer time: {(time.perf_counter() - start) * 1000:.2f} ms")
+ return outputs
+
+    def process_output(
+        self, outputs: list[np.ndarray]
+    ) -> tuple[np.ndarray, np.ndarray]:
+
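+        # The decoder proposes multiple candidate masks with predicted
+        # scores; keep only the highest-scoring one, resized back to the
+        # original image size.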
+ scores = outputs[1].squeeze()
+ masks = outputs[0][0]
+
+ # Select the best masks based on the scores
+ best_mask = masks[np.argmax(scores)]
+ best_mask = cv2.resize(
+ best_mask, (self.orig_im_size[1], self.orig_im_size[0])
+ )
+ return (
+ np.array([[best_mask]]),
+ scores,
+ )
+
+ def set_image_size(self, orig_im_size: tuple[int, int]) -> None:
+ self.orig_im_size = orig_im_size
+
+ def get_input_details(self) -> None:
+ model_inputs = self.session.get_inputs()
+ self.input_names = [
+ model_inputs[i].name for i in range(len(model_inputs))
+ ]
+
+ def get_output_details(self) -> None:
+ model_outputs = self.session.get_outputs()
+ self.output_names = [
+ model_outputs[i].name for i in range(len(model_outputs))
+ ]
diff --git a/anylabeling/services/auto_labeling/sam_onnx.py b/anylabeling/services/auto_labeling/sam_onnx.py
index b50cfa0..c99bcb4 100644
--- a/anylabeling/services/auto_labeling/sam_onnx.py
+++ b/anylabeling/services/auto_labeling/sam_onnx.py
@@ -13,25 +13,12 @@ def __init__(self, encoder_model_path, decoder_model_path) -> None:
self.target_size = 1024
self.input_size = (684, 1024)
- # Load models
- providers = onnxruntime.get_available_providers()
-
- # Pop TensorRT Runtime due to crashing issues
- # TODO: Add back when TensorRT backend is stable
- providers = [p for p in providers if p != "TensorrtExecutionProvider"]
-
- if providers:
- logging.info(
- "Available providers for ONNXRuntime: %s", ", ".join(providers)
- )
- else:
- logging.warning("No available providers for ONNXRuntime")
self.encoder_session = onnxruntime.InferenceSession(
- encoder_model_path, providers=providers
+ encoder_model_path
)
self.encoder_input_name = self.encoder_session.get_inputs()[0].name
self.decoder_session = onnxruntime.InferenceSession(
- decoder_model_path, providers=providers
+ decoder_model_path
)
def get_input_points(self, prompt):
diff --git a/anylabeling/services/auto_labeling/segment_anything.py b/anylabeling/services/auto_labeling/segment_anything.py
index 61d910b..1248238 100644
--- a/anylabeling/services/auto_labeling/segment_anything.py
+++ b/anylabeling/services/auto_labeling/segment_anything.py
@@ -3,6 +3,7 @@
import traceback
import cv2
+import onnx
import numpy as np
from PyQt5 import QtCore
from PyQt5.QtCore import QThread
@@ -16,6 +17,7 @@
from .model import Model
from .types import AutoLabelingResult
from .sam_onnx import SegmentAnythingONNX
+from .sam2_onnx import SegmentAnything2ONNX
class SegmentAnything(Model):
@@ -78,9 +80,14 @@ def __init__(self, config_path, on_message) -> None:
)
# Load models
- self.model = SegmentAnythingONNX(
- encoder_model_abs_path, decoder_model_abs_path
- )
+ if self.detect_model_variant(decoder_model_abs_path) == "sam2":
+ self.model = SegmentAnything2ONNX(
+ encoder_model_abs_path, decoder_model_abs_path
+ )
+ else:
+ self.model = SegmentAnythingONNX(
+ encoder_model_abs_path, decoder_model_abs_path
+ )
# Mark for auto labeling
# points, rectangles
@@ -96,6 +103,16 @@ def __init__(self, config_path, on_message) -> None:
self.pre_inference_worker = None
self.stop_inference = False
+ def detect_model_variant(self, decoder_model_abs_path):
+ """Load and detect model variant based on the model architecture"""
+ model = onnx.load(decoder_model_abs_path)
+ input_names = [input.name for input in model.graph.input]
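+        # SAM2 decoders take extra high-resolution feature maps as inputs;
+        # their presence in the ONNX graph distinguishes SAM2 from SAM.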
+ if "high_res_feats_0" in input_names:
+ return "sam2"
+ return "sam"
+
def set_auto_labeling_marks(self, marks):
"""Set auto labeling marks"""
self.marks = marks
diff --git a/anylabeling/views/labeling/label_widget.py b/anylabeling/views/labeling/label_widget.py
index 2c327c2..2971e6a 100644
--- a/anylabeling/views/labeling/label_widget.py
+++ b/anylabeling/views/labeling/label_widget.py
@@ -1097,7 +1097,7 @@ def get_labeling_instruction(self):
f"{text_mode} {self.canvas.get_mode()} - {text_shortcuts}"
f" {text_previous} A, {text_next} D,"
f" {text_rectangle} R,"
- f" {text_polygon}: P"
+ f" {text_polygon} P"
)
@pyqtSlot()
diff --git a/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui b/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui
index 7c962c6..56d8081 100644
--- a/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui
+++ b/anylabeling/views/labeling/widgets/auto_labeling/auto_labeling.ui
@@ -43,6 +43,9 @@
2
+
+ 4
+
0
@@ -187,6 +190,11 @@
0
+
+
+ 10
+
+
margin-top: 0;
margin-bottom: 10px;
diff --git a/requirements-gpu.txt b/requirements-gpu.txt
index e981bc2..66e5303 100644
--- a/requirements-gpu.txt
+++ b/requirements-gpu.txt
@@ -4,7 +4,7 @@ imgviz==1.5.0
natsort==8.1.0
termcolor==1.1.0
PyYAML==6.0
-onnx==1.16.1
-onnxruntime-gpu==1.18.1
+onnx==1.16.2
+onnxruntime-gpu==1.16.3
qimage2ndarray==1.10.0
darkdetect==0.8.0
\ No newline at end of file
diff --git a/requirements-macos.txt b/requirements-macos.txt
index 8c6f2f3..dceb623 100644
--- a/requirements-macos.txt
+++ b/requirements-macos.txt
@@ -4,7 +4,7 @@ imgviz==1.5.0
natsort==8.1.0
termcolor==1.1.0
PyYAML==6.0
-onnx==1.16.1
-onnxruntime==1.18.1
+onnx==1.16.2
+onnxruntime==1.16.3
qimage2ndarray==1.10.0
darkdetect==0.8.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 92ad1c7..9bcf324 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ imgviz==1.5.0
natsort==8.1.0
termcolor==1.1.0
PyYAML==6.0
-onnx==1.16.1
-onnxruntime==1.18.1
+onnx==1.16.2
+onnxruntime==1.16.3
qimage2ndarray==1.10.0
darkdetect==0.8.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index bd81ee4..2500cb4 100644
--- a/setup.py
+++ b/setup.py
@@ -50,7 +50,7 @@ def get_install_requires():
"termcolor",
"opencv-python-headless",
'PyQt5>=5.15.7; platform_system != "Darwin"',
- "onnx==1.16.1",
+ "onnx==1.16.2",
"qimage2ndarray==1.10.0",
"darkdetect==0.8.0",
]
@@ -60,10 +60,10 @@ def get_install_requires():
# Note: onnxruntime-gpu is not available on macOS
preferred_device = get_preferred_device()
if preferred_device == "GPU" and platform.system() != "Darwin":
- install_requires.append("onnxruntime-gpu==1.18.1")
+ install_requires.append("onnxruntime-gpu==1.16.3")
print("Building AnyLabeling with GPU support")
else:
- install_requires.append("onnxruntime==1.18.1")
+ install_requires.append("onnxruntime==1.16.3")
print("Building AnyLabeling without GPU support")
return install_requires
@@ -95,8 +95,6 @@ def get_long_description():
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python",
- "Programming Language :: Python :: 3.8",
- "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",